# CLEARING UP WORKSPACE FOR RAM MANAGEMENT.
# 1. Clearing plots.
invisible(if(!is.null(dev.list())) dev.off())
# 2. Cleaning workspace.
rm(list=ls())
# 3. Cleaning console.
cat("\014")# AVOIDING MESSAGES AND WARNINGS.
# Here, the objective is to avoid messages and warnings
# in the final document Tweet_Attribution_Stylometry.html.
# Anyway, messages and warnings, at least those produced
# on my computer, have already been dealt with.
knitr::opts_chunk$set(echo = TRUE,
message = FALSE,
warning = FALSE)
# The next opts_chunk fully deploys figures and centers them.
knitr::opts_chunk$set(out.width = "100%",
fig.align = "center")
# The next instruction facilitates table layout in HTML.
options(knitr.table.format = "html")
# Below this code chunk, there are 3 purely typographical
# commands, just to mark a distinction with the body text. An accuracy level of 92 % has been reached in attributing tweets on the validation set with 212 stylometric predictors, while a baseline model would predict with an accuracy level of 53 % (percentage of the main class). This is deemed to be of valuable predictive quality on very short texts.
This has required long preparation through Data Profiling, Data Wrangling, Exploratory Data Analysis, and Predictor Building.
Tweets come from the account of Candidate Donald Trump during the 2016 US presidential election campaign. Two devices have been used to issue these tweets: an Android device and an iPhone. The challenge is to predict the sending device on the validation set by using stylometric predictors.
Other predictors are available, such as timing, reactions (likes and retweets), content words or sentiments. They might be used later on in other Data Science projects based on the same dataset to further increase accuracy.
Tweet attribution has been operated through Machine Learning with one algorithm: eXtreme Gradient Boosting Tree.
This project is merely technical; it expresses absolutely no political vision or standpoint; it is in no way person-related; the author’s methods, insights, results, and conclusions are only the ones explicitly expressed in this project itself, which only encompasses files lodged with the GitHub repository: https://github.com/Dev-P-L/Tweet_Attribution_Stylometry .
TAGS: stylometry, tweet attribution, Natural Language Processing, Text Mining, Regex, interactive wordcloud, interactive graph, interactive table, Machine Learning, eXtreme Gradient Boosting Tree
GITHUB: https://github.com/Dev-P-L/Tweet_Attribution_Stylometry
This project is based on the dataset trump_tweets from the R package dslabs. This means that usage of this project must strictly comply with all requirements imposed by dslabs and by all dslabs sources.
Dear Readers,
For your convenience, the final document Tweet_Attribution_Stylometry.html is an HTML document with interactive layout: the table of contents, the wordclouds, the graphs, and many tables are interactive; moreover, code can be visualized by pushing tag buttons on the right-hand-side of the HTML document.
Furthermore, for everyone’s convenience, I have tried using, when possible, colors that are clearly distinguishable to take into account alternative color perception, following pieces of advice given in Cookbook R. As far as code visualization is concerned — when pushing tag buttons on the right-hand-side of the HTML document —, I hope the theme espresso, which I have chosen, is satisfactory.
# With a view to providing visual comfort to everybody, most
# colors have been picked up from cbPalette and cbbPalette
# from http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/ .
# Colors assigned to the Android device.
bluish_green <- "#009E73"
# Colors assigned to the iPhone.
gray_palette <- "#999999"
# Palette with both
duo_Palette_bluishgreen_gray <- c("#009E73", "#999999")
# Other colors from cbPalette and cbbPalette
sky_blue <- "#56b4e9"
deep_blue <- "#0072B2"
black <- "#000000"
# Other colors outside of cbPalette and cbbPalette
white <- "#ffffff"
light_sky_blue <- "#f5fafe"
gray <- "gray"You are most welcome to knit the file Tweet_Attribution_Stylometry.Rmd to produce the document Tweet_Attribution_Stylometry.html . For the record, some characteristics of my work environment are visible in the last section of this document, titled R Session Info.
If necessary, numerous R packages are downloaded by the code chunk below:
# PACKAGE CONTAINING THE DATASET.
if(!require(dslabs)) install.packages("dslabs", repos = "http://cran.us.r-project.org")
# PACKAGES ASSOCIATED WITH TIDYVERSE.
# Other packages could be added to this group but have been
# linked below to text processing.
if(!require(tidyverse)) install.packages("tidyverse", repos = "http://cran.us.r-project.org")
if(!require(scales)) install.packages("scales", repos = "http://cran.us.r-project.org")
if(!require(lubridate)) install.packages("lubridate", repos = "http://cran.us.r-project.org")
if(!require(ggthemes)) install.packages("ggthemes", repos = "http://cran.us.r-project.org")
# PACKAGES ASSOCIATED WITH R MARKDOWN.
if(!require(knitr)) install.packages("knitr", repos = "http://cran.us.r-project.org")
if(!require(kableExtra)) install.packages("kableExtra", repos = "http://cran.us.r-project.org")
if(!require(gridExtra)) install.packages("gridExtra", repos = "http://cran.us.r-project.org")
# PACKAGES RELATED TO NLP AND TEXT MINING.
if(!require(utf8)) install.packages("utf8", repos = "http://cran.us.r-project.org")
if(!require(stringr)) install.packages("stringr", repos = "http://cran.us.r-project.org")
if(!require(tm)) install.packages("tm", repos = "http://cran.us.r-project.org")
if(!require(textreg)) install.packages("textreg", repos = "http://cran.us.r-project.org")
if(!require(quanteda)) install.packages("quanteda", repos = "http://cran.us.r-project.org")
if(!require(tidytext)) install.packages("tidytext", repos = "http://cran.us.r-project.org")
if(!require(textdata)) install.packages("textdata", repos = "http://cran.us.r-project.org")
if(!require(stopwords)) install.packages("stopwords", repos = "http://cran.us.r-project.org")
# PACKAGES ASSOCIATED WITH INTERACTIVE WORDCLOUDS,
# TABLES, AND GRAPHS.
if(!require(devtools)) install.packages("devtools", repos = "http://cran.us.r-project.org")
if(!require(htmltools)) install.packages("htmltools", repos = "http://cran.us.r-project.org")
if(!require(shiny)) install.packages("shiny", repos = "http://cran.us.r-project.org")
if(!require(httpuv)) install.packages("httpuv", repos = "http://cran.us.r-project.org")
if(!require(xtable)) install.packages("xtable", repos = "http://cran.us.r-project.org")
if(!require(sourcetools)) install.packages("sourcetools", repos = "http://cran.us.r-project.org")
if(!require(fastmap)) install.packages("fastmap", repos = "http://cran.us.r-project.org")
if(!require(DT)) install.packages("DT", repos = "http://cran.us.r-project.org")
if(!require(plotly)) install.packages("plotly", repos = "http://cran.us.r-project.org")
# PACKAGES FOR MACHINE LEARNING
if(!require(caret)) install.packages("caret", repos = "http://cran.us.r-project.org")
if(!require(xgboost)) install.packages("xgboost", repos = "http://cran.us.r-project.org")
# INFORMATIONAL PACKAGE
if(!require(states)) install.packages("states", repos = "http://cran.us.r-project.org")
# REQUIRING LIBRARIES.
library(dslabs)
library(tidyverse)
library(scales)
library(lubridate)
library(ggthemes)
library(knitr)
library(kableExtra)
library(gridExtra)
library(utf8)
library(stringr)
library(tm)
library(textreg)
library(quanteda)
library(tidytext)
library(textdata)
library(stopwords)
library(devtools)
library(htmltools)
library(shiny)
library(httpuv)
library(xtable)
library(sourcetools)
library(fastmap)
library(DT)
library(plotly)
library(caret)
library(xgboost)
library(states)
# FROM GITHUB
# Basic
if(!require(githubinstall)) install.packages("githubinstall", repos = "http://cran.us.r-project.org")
library(githubinstall)
# Emojis
devtools::install_github("hadley/emo")
library(emo)
# Repair Tool
# Prevents the function wordcloud2() from silently failing
# after the first wordcloud. For explanation, please see
# https://github.com/Lchiffon/wordcloud2/issues/65 .
devtools::install_github("gaospecial/wordcloud2")
library(wordcloud2)Data are downloaded from the dataset trump_tweets from the R package dslabs.
Tweet texts are standardized to facilitate Natural Language Processing (NLP), Text Mining, and Regex : texts are converted to UTF-8, which replaces curly single quotes and apostrophes by straight one; moreover, curly double quotes are also replaced by straight ones.
Later on, another version of the tweet texts will be produced without standardization, in order to capture some peculiarities.
Here are the eight features from the dataset.
data("trump_tweets")
tweets <- trump_tweets
# Standardization of tweets
tweets_utf8 <- tweets %>%
mutate(text = sapply(text, utf8_normalize, map_quote = TRUE)) %>%
mutate(text = str_replace_all(text, "“|”", '"'))
rm(trump_tweets, tweets)
# Prints data frame description with bg-info layout
# as requested in the chunk header.
str(tweets_utf8, vec.len = 1)## 'data.frame': 20761 obs. of 8 variables:
## $ source : chr "Twitter Web Client" ...
## $ id_str : chr "6971079756" ...
## $ text : chr "From Donald Trump: Wishing everyone a wonderful holiday & a happy, healthy, prosperous New Year. Let's think li"| __truncated__ ...
## $ created_at : POSIXct, format: "2009-12-23 12:38:18" ...
## $ retweet_count : int 28 33 ...
## $ in_reply_to_user_id_str: chr NA ...
## $ favorite_count : int 12 6 ...
## $ is_retweet : logi FALSE ...
Documentation is available at ?trump_tweets in an R session.
Let’s extract the relevant data, that is to say the tweets issued by the Android device and by the iPhone during the 2016 US presidential campaign.
A period has been chosen: the 2016 US presidential campaign, between the day the Candidate Donald Trump announced his campaign until election day. Here are the eight features again, this time from the squeezed dataset.
Moreover, data are kept only for both devices, that is to say the Android device and the iPhone.
# Tweets are kept only if they were sent during the 2016 US presidential
# campaign and by one of the 2 devices, whose names are simplified.
temporary_data_set <- tweets_utf8 %>%
mutate(device = str_replace_all(
str_replace_all(source, "Twitter for Android", "Android"),
"Twitter for iPhone", "iPhone")) %>%
filter(device %in% c("Android", "iPhone") &
created_at >= ymd("2015-06-17") &
created_at < ymd("2016-11-08")) %>%
select(- source)
rm(tweets_utf8)
# Prints data frame description with bg-info layout
# as requested in the chunk header.
str(temporary_data_set, vec.len = 1)## 'data.frame': 3950 obs. of 8 variables:
## $ id_str : chr "682703233492619264" ...
## $ text : chr "I would like to wish everyone A HAPPY AND HEALTHY NEW YEAR. WE MUST ALL WORK TOGETHER TO, FINALLY, MAKE AMERICA"| __truncated__ ...
## $ created_at : POSIXct, format: "2015-12-31 18:21:49" ...
## $ retweet_count : int 6776 2755 ...
## $ in_reply_to_user_id_str: chr NA ...
## $ favorite_count : int 16495 6824 ...
## $ is_retweet : logi FALSE ...
## $ device : chr "Android" ...
The table above shows, among others, that the total number of rows is 3,950.
This is only one fifth of the original dataset but the number of observations suffices in principle to apply Machine Learning algorithms, even if the dataset is split into training set and validation set.
Among the features, id_str is the tweet identifier.
Is it an operational identifier: is it exclusively comprised of unique values?
# Calculates number of observations and number of unique identifiers.
row_number <- nrow(temporary_data_set)
unique_identifiers <-
length(unique(temporary_data_set$id_str))
# Table with both variables
tab <- data.frame(format(row_number, big.mark = " "),
format(unique_identifiers, big.mark = " ")) %>%
`colnames<-`(c("Number of Observations",
"Number of Unique Identifiers"))
rm(row_number, unique_identifiers)
# Prints table with "bg-info" layout.
knitr::kable(tab, align = "c",
table.attr = "class=\'bg-primary\'") %>%
kableExtra::kable_styling()| Number of Observations | Number of Unique Identifiers |
|---|---|
| 3 950 | 3 950 |
The number of unique identifiers is exactly the number of tweets. Which means that the identifier has a different value for each tweet, which is a requirement for an identifier. Consequently, id_str will be kept as the identifier.
The variable source indicates the device that was used to compose and upload each tweet. It is the dependent variable or label or target variable: tweet attribution will be attribution of tweets either to the Android device or to the iPhone. Here is the breakdown of tweets by device.
# Table with breakdown of tweets by device
tab <- data.frame(temporary_data_set$device) %>%
group_by(temporary_data_set$device) %>%
summarize(n = n(),
perc = n * 100 / length(temporary_data_set$device)) %>%
mutate(n = format(n, big.mark = " ")) %>%
mutate(perc = paste(round(perc, 0), "%", sep = " "))
# The next part from this code chunk constructs and prints
# a table with a customizable header.
# First, column names from tab are removed.
names(tab) <- NULL
# A new vector of column names is created.
name <- c("Device",
"Number of Tweets by Device",
"Percentage of Tweets by Device")
# The next line of code assembles tab and the vector of column names,
# which becomes the first row and appears as the new header, which is
# largely customizable.
tab <- rbind(name, tab)
rm(name)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE,
color = white, background = deep_blue) %>%
row_spec(2, color = white, background = bluish_green) %>%
row_spec(3, color = white, background = gray_palette) | Device | Number of Tweets by Device | Percentage of Tweets by Device |
| Android | 1 847 | 47 % |
| iPhone | 2 103 | 53 % |
As shown in the table above, tweets by the iPhone are somewhat more numerous but the difference in percentage is limited.
A baseline model would predict device by attributing to all observations the class with the most occurrences, that is to say the iPhone. This would deliver very low prediction accuracy, or more precisely 53 % accuracy, which is the proportion of iPhone tweets.
Consequently, accuracy appears to be a rather satisfactory performance metric. It will be the performance metric in this project.
The variable text is the tweet itself. Actually, out of the tweets a lot of stylometric predictors — or independent variables — will be extracted, such as emojis, punctuation marks, special sequences of characters, function words — stopwords — and entities — URLs, mentions, and hashtags . This will be substantially developed in the following sections.
The variable created_at contains the date and time at which the tweet was tweeted. It could be essential in another project. It could deliver several predictors such as month, day, hour, etc. This will not be done in this project, which is based on stylometry.
The variable is_retweet is “A logical telling us if it is a retweet or not.” We see in the table above that the first value is FALSE; are there any TRUE values? Even if it is not relevant in this stylometric project, the variability of this feature will be quickly checked: would there be enough variations in this feature to produce an effective predictor in another project?
# Number of FALSE/TRUE values in is_retweet in order to check up
# whether there are TRUE values.
number_true <- sum(temporary_data_set$is_retweet)
number_false <-
length(temporary_data_set$is_retweet) - number_true
# Tab with number of FALSE/TRUE values in is_retweet
tab <-
data.frame(event = c("FALSE", "TRUE"),
number = c(number_false, number_true)) %>%
mutate(number = format(number, big.mark = " ")) %>%
`colnames<-`(c("Is the Tweet Actually a Retweet?",
"Number of Tweets"))
rm(number_true, number_false)
# Prints table with bg-info layout.
knitr::kable(tab, align = "c",
table.attr = "class=\'bg-primary\'") %>%
kableExtra::kable_styling()| Is the Tweet Actually a Retweet? | Number of Tweets |
|---|---|
| FALSE | 3 950 |
| TRUE | 0 |
There is no TRUE value. In the squeezed dataset, this variable shows no variation. This won’t be investigated any further. But there can be some copy-paste inside of tweets, though. Anyway, quotation marks will be extensively taken into account as predictors.
The feature retweet_count indicates “How many times tweet had been retweeted at time dataset was created.” The feature favorite_count is the “Number of times tweet had been favored at time dataset was created.”. Both represent reactions to the Android and iPhone tweets; both are well populated and could be prospective predictors in another project.
Last, in_reply_to_user_id_str means: “If a reply, the user id of person being replied to.” There are a few values.
This project will be based on the variable text.
What about the training set?
In the previous section, we have noticed that the number of observations is almost 4,000 and that the breakdown by device is almost even.
Let’s take a prudent approach when splitting data in order to preemptively avoid suboptimal representation.
The number of rows suffices to reach statistical representativeness even in case of splitting into training set and validation set. It seems unadvisable to split more finely between training set, test set, and validation set, though: this would further reduce the size of samples and, anyway, if necessary, the test set could be replaced by, for instance, bootstrapped resamples.
The splitting proportion will be two thirds for the training set and one third for the validation set. What’s the breakdown by device in the training set?
# Creating the index of the validation set at random.
set.seed(1)
ind_val <-
createDataPartition(y = temporary_data_set$device, times = 1,
p = 1/3, list = FALSE)
# Deducting the index of the training set.
ind_train <- as.integer(setdiff(1:nrow(temporary_data_set), ind_val))
# Creating the training set.
train_utf8 <- temporary_data_set[ind_train, ]
# The validation set will be created at the very end of this project
# for the validation phase.
# The indexes are saved locally with the function write_csv()
# and on https://github.com/Dev-P-L .
write_csv(as.data.frame(ind_val), "ind_val.csv")
write_csv(as.data.frame(ind_train), "ind_train.csv")
rm(temporary_data_set, ind_val)
# Table with tweet count by device
tab <- train_utf8 %>%
group_by(device) %>%
summarise(n = n()) %>%
mutate(n = format(n, big.mark = " "))
# The next part from this code chunk constructs and prints
# a table with a largely customizable header.
# First, column names from tab are removed.
names(tab) <- NULL
# A new vector of column names is created.
name <-
c("Device",
"Number of Tweets by Device in the Training Set")
# Assembles tab and the vector of column names, which becomes
# the first row and appears as a new header largely customizable.
tab <- rbind(name, tab)
rm(name)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE,
color = white, background = deep_blue) %>%
row_spec(2, color = white, background = bluish_green) %>%
row_spec(3, color = white, background = gray_palette) | Device | Number of Tweets by Device in the Training Set |
| Android | 1 231 |
| iPhone | 1 402 |
First, data have been squeezed to produce the dataset that will be used.
Second, some data profiling has been performed, in a broad way since all available features have been tackled.
Third, the squeezed dataset has been split into a training set and a validation set.
Fourth, data wrangling will be conducted on the training set to organize data in appropriate ways for the Exploratory Data Analysis.
Only three features will be used: the identifier, the tweet texts, and the identification of the device that sent the tweets.
The datasets or data tools will be:
These datasets and data tools are based on tweets with punctuation standardization, curly apostrophes and quotes being replaced with straight ones. Part of this standardization has taken place in the scope of UTF-8 harmonization, which standardizes apostrophes/single quotes; standardization has been extended to double quotes. Later on, a non-harmonized version will be developed in order to capture some peculiarities.
# URLS
# In case 2 URLs might stick to each other, let's use buffer
# with empty space character before URLs when extracting URLs.
buffer <- str_replace_all(train_utf8$text, "http", " http")
# VECTOR OF URLS
# Extracting all URL occurrences as a list of lists.
urls <- str_extract_all(buffer, "https://\\S+|http://\\S+")
rm(buffer)
# Converts a list of lists into a character vector.
urls <- unlist(urls)
# Drops punctuation at the end of extractions.
urls <- str_replace_all(urls, "[:punct:]+$", "")
# Keeps only unique URLs.
urls <- unique(urls)
# URLS AS REGEX PATTERN FOR TEXT MINING
# Escape characters in front of question marks in URLs
urls_with_question_marks <-
str_replace_all(urls, "\\?", "\\\\?")
# URLs assembled into a REGEX pattern for Text Mining.
urls_as_pattern <-
paste(urls_with_question_marks, "|", sep = "", collapse = "")
urls_as_pattern <-
str_replace(urls_as_pattern, "\\|$", "")
# TWEETS WITH URL PLACEHOLDER
buffer <- train_utf8$text
for (i in 1:length(urls)) {
buffer <-
str_replace_all(buffer,
urls_with_question_marks[i],
"URLPLACEHOLDER")
}
train_utf8_no_urls <-
train_utf8 %>%
select(id_str, device) %>%
mutate(text = buffer) %>%
select(id_str, text, device)
rm(i, buffer)
# NUMBER OF URL OCCURRENCES IN EACH TWEET
urls_count <- str_count(train_utf8$text, urls_as_pattern)
# or it could be done with
# str_count(train_utf8_no_urls$text, "URLPLACEHOLDER")
# VECTOR WITH ALL MENTIONS
buffer <- train_utf8_no_urls$text
mentions <- str_extract_all(buffer, "@\\w+")
mentions <- unique(unlist(mentions))
# MENTIONS AS REGEX PATTERN FOR TEXT MINING
mentions_as_pattern <- paste(mentions, "|", sep = "", collapse = "")
mentions_as_pattern <- str_replace(mentions_as_pattern, "\\|$", "")
# NUMBER OF MENTION OCCURRENCES IN EACH TWEET
mentions_count <- str_count(buffer, mentions_as_pattern)
# VECTOR OF ALL HASHTAGS
hashtags <- str_extract_all(buffer, "#\\w+")
hashtags <- unique(unlist(hashtags))
# The last but one line of code discards, among others, the
# sequence #'s since that piece of code accepts only underscore
# as punctuation mark after the pound key #.
# Identifying the sequence #1 to further eliminate it.
index1 <- str_detect(hashtags, "#1\\s|#1(?!_)[:punct:]|#1$")
index1 <- which(index1 ==TRUE)
# Identifying the sequence #2 to further eliminate it.
index2 <- str_detect(hashtags, "#2\\s|#2(?!_)[:punct:]|#2$")
index2 <- which(index2 ==TRUE)
# Identifying the sequence #'s to further eliminate it.
index3 <- str_detect(hashtags, "#1for")
index3 <- which(index3 ==TRUE)
# Regrouping the 3 identifiers.
index <- as.integer(c(index1, index2, index3))
# Inverting the indexation to keep only the rows
# that have not been identified by the 3 identifiers.
index <- setdiff(1:length(hashtags), index)
# Selecting the rows.
hashtags <- hashtags[index]
rm(index, index1, index2, index3)
# HASHTAGS AS REGEX PATTERN FOR TEXT MINING
hashtags_as_pattern <- paste(hashtags, "|", sep = "", collapse = "")
hashtags_as_pattern <- str_replace(hashtags_as_pattern, "\\|$", "")
# NUMBER OF URL OCCURRENCES IN EACH TWEET
hashtags_count <- str_count(buffer, hashtags_as_pattern)
# TWEETS WITH NEITHER URLS NOR MENTIONS NOR HASHTAGS
new <- str_replace_all(buffer,
mentions_as_pattern,
"MENTIONPLACEHOLDER")
new <- str_replace_all(new,
hashtags_as_pattern,
"HASHTAGPLACEHOLDER")
train_utf8_no_urls_mentions_hashtags <-
train_utf8_no_urls %>%
mutate(text = new)
rm(urls_with_question_marks, buffer, new)Exploratory Data Analysis (EDA) will try to obtain insights into tweets in order to better construct prospective predictors for tweet attribution.
EDA will be conducted exclusively on the training set. Tweet texts will be analyzed through NLP, Text Mining, and Regex. Numerous text parts will be extracted in a stylometric approach, especially
Occurrences of analyzed tweet parts will be counted. They will be broken down by device in order to evaluate their discriminant power when attributing tweets.
EDA will extensively use interactive presentation, allowing readers to better associate with insight research; indeed, interactive tables are powerful drilling down tools in text data; interactive wordclouds and graphs can smoothly provide the reader with occurrence information if she/he so wishes.
Colors are also widely utilized as they can facilitate insight perception. Data presentation is in green for the Android device and in gray for the iPhone — specific hues in these colors have been chosen to facilitate alternative color perception. General information is in blue.
Insight gathering will lead to prospective predictors, which will be populate a Machine Learning (ML) model.
The first Text Mining tool will be an interactive table presenting the Android tweets.
# Gathering data.
tab <-
train_utf8 %>%
select(created_at, text, device) %>%
filter(device == "Android") %>%
select(- device) %>%
mutate(created_at = format(created_at, usetz = TRUE)) %>%
as.data.frame() %>%
`colnames<-`(c("DATETIME",
"TWEETS SENT BY THE ANDROID DEVICE"))
# This JavaScript extension will color/background color the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# This JavaScript extension will color/background color the body text.
rowCallback <-
c("function(row, data, index, rowId) {
",
"console.log(rowId)",
"if(rowId >= 0) {
",
"row.style.backgroundColor = '#009E73',
row.style.color = 'white';",
"}",
"}
")
# Prints datatable with the DT package and the JavaScript extensions.
datatable(tab, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(tab) - 1))),
initComplete = JS(initComplete),
rowCallback = JS(rowCallback)))And now, the same will be done for iPhone tweets.
tab <-
train_utf8 %>%
select(created_at, text, device) %>%
filter(device == "iPhone") %>%
select(- device) %>%
mutate(created_at = format(created_at, usetz = TRUE)) %>%
as.data.frame() %>%
`colnames<-`(c("DATETIME", "TWEETS SENT BY THE IPHONE"))
# This JavaScript extension will color/background color the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# This JavaScript extension will color/background color the body text.
rowCallback <-
c("function(row, data, index, rowId) {
",
"console.log(rowId)",
"if(rowId >= 0) {
",
"row.style.backgroundColor = '#999999',
row.style.color = 'white';",
"}",
"}
")
# Prints datatable with the DT package and the JavaScript extensions.
datatable(tab, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(tab) - 1))),
initComplete = JS(initComplete),
rowCallback = JS(rowCallback)))The two interactive tables already show, when compared, numerous differences between Android tweets and iPhone tweets from a stylometric point of view. Here are just a few hasty statements:
Emojis will first be examined.
The next table gives the occurrence number of emojis by device. The last column gives counts adjusted for sample size: indeed, there are a little bit more iPhone tweets and, consequently, the number of emojis produced by the iPhone is proportionately downsized.
# Calculating a comparability factor to proportionally reduce
# iPhone data when comparing the two devices. The comparability
# factor will be kept for further use.
temp <-
train_utf8 %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(device = as.character(device))
comparability_factor <- c(1, temp$n[1] / temp$n[2])
rm(temp)
# Calculates the occurrences of emojis with package hadley/emo.
tab <-
train_utf8_no_urls_mentions_hashtags %>%
select(device, text) %>%
mutate(n = ji_count(text)) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(n)) %>%
mutate(n_reg = round(n * comparability_factor, 0)) %>%
mutate(device = as.character(device))
# The next part from this code chunk constructs and prints
# a table with a largely customizable header.
# First, column names from tab are removed.
names(tab) <- NULL
# A new vector of column names is created.
name <- c("Device",
"Occurrences of Emojis",
"Number Sample-adjusted")
# Assembles tab and the vector of column names which becomes
# the first row and appears as a new header largely customizable.
tab <- rbind(name, tab)
rm(name)
# Prints table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE,
color = white, background = deep_blue) %>%
row_spec(2, color = white, background = bluish_green) %>%
row_spec(3, color = white, background = gray_palette)| Device | Occurrences of Emojis | Number Sample-adjusted |
| Android | 0 | 0 |
| iPhone | 16 | 14 |
Actually, there are emojis only in iPhone tweets. This is interesting predictive information but the number of occurrences is limited.
The next table shows the tweets that contain emojis.
# Detects tweets containing emojis with package hadley/emo.
tab <-
train_utf8_no_urls_mentions_hashtags %>%
select(text, device) %>%
mutate(ind = ji_detect(text))
index <- which(tab$ind == TRUE)
tab <- tab[index, ]
rm(index)
# This JavaScript extension will color/background color the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# This JavaScript extension will color/background color the body text.
rowCallback <-
c("function(row, data, index, rowId) {
",
"console.log(rowId)",
"if(rowId >= 0) {
",
"row.style.backgroundColor = '#999999',
row.style.color = 'White';",
"}",
"}
")
# Prints the interactive datatable.
datatable(tab, rownames = FALSE,
options =
list(pageLength = 10, scrollX = F,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(tab) - 1))),
initComplete = JS(initComplete),
rowCallback = JS(rowCallback)))The table above shows that there are only 4 tweets that contain emojis. Nevertheless, the prospective predictor will be kept as long as there is no new information invalidating it.
Punctuation marks can be one characteristic of a style, of a twitter. Consequently, it might be productive to pinpoint punctuation marks that are used at different frequencies by the two devices.
In this project, the compound noun punctuation mark also refers to special characters such as +, =, $, and *_* (underscore). For reasons of brevity, punctuation marks and special characters will often be designated as punctuation marks.
Another terminological simplification will usually be applied: generally, the word dot will be used instead of period, full stop, or point, including for dots at the beginning of a tweet — in front of a mention — or for several dots in a row that are no ellipses represented by a single Unicode character U+2026. Sometimes, terminological distinction has been maintained between dot, period, full stop, and point.
The compound noun quotation mark is often simplified to quote: see reference to American Heritage Dictionary here.
Punctuation marks — including special characters — will be looked for in a tweet version that has been arranged, in the section Data Wrangling, to facilitate NLP, Text Mining, and Regex. Indeed, entities — URLs, mentions, and hashtags — have been replaced by placeholders. Moreover, punctuation has been standardized with curly apostrophes and quotes being replaced by straight ones.
At some point, tweet texts without punctuation standardization will be used to capture some peculiarities, that is to say curly apostrophes, curly single quotes, and curly double quotes.
Entities — URLs, mentions, and hashtags — will be dealt with separately.
As a first step, occurrences of single punctuation marks will be counted and visualized in three graphs.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(tab, initComplete, rowCallback)
single_punctuation_marks <-
c(".", "…", "?", "!", ",", ":", "/", "+", "=", "$",
"_", "-", "–", "—", "(", ")", '"', "'")
list_of_names_single_punctuation <-
c("Dot", "Ellipsis", "Question Mark", "Exclamation Mark", "Comma",
"Colon", "Slash", "Plus", "Equal", "Dollar",
"Underscore", "Hyphen", "En Dash", "Em Dash",
"Left Parenthesis", "Right Parenthesis",
"Double Quote", "Single Quote/ \n Apostrophe")
# A pattern will be assembled now to represent multiple
# punctuation marks that have to be discarded first
# before counting single punctuation marks.
to_be_discarded <-
c("\\.", "\\?", "!", ",", ":", "/", "\\+", "=", "\\$",
"_", "-", "–", "—", "\\(", "\\)", '\\"', "\\'")
to_be_discarded <-
paste(to_be_discarded, "{2,}", "|", sep = "", collapse = "")
to_be_discarded <- str_replace(to_be_discarded, "\\|$", "")
# Two special sequences will be discarded as well: numbers followed
# by a percent sign and the sequence & both will be treated
# with other special sequences below.
to_be_discarded <-
paste(to_be_discarded, "\\d+%|&", sep = "", collapse = "")
# Discards multiple punctuation marks from tweets
# just for counting single punctuation marks.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, to_be_discarded, "")
rm(to_be_discarded)
# Receptacle data frame for numbers of occurrences
output <-
data.frame(matrix(length(single_punctuation_marks) * 3,
nrow = length(single_punctuation_marks),
ncol = 3) * 1) %>%
`colnames<-`(c("punct", "Android", "iPhone"))
# For loop to compute the occurrences of single punctuation marks.
for (i in 1:length(single_punctuation_marks)) {
occurrence_punctuation_marks <-
train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(number = str_count(text,
fixed(single_punctuation_marks[i]))) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(number = sum(number)) %>%
mutate(device = as.character(device))
output[[i, 1]] <- list_of_names_single_punctuation[i]
output[[i, 2]] <- occurrence_punctuation_marks$number[1]
output[[i, 3]] <- occurrence_punctuation_marks$number[2]
}
rm(i, buffer, list_of_names_single_punctuation,
occurrence_punctuation_marks)
# Three presentation graphs according to size will be drawn
# instead of one in order to foster readability.
# FIRST GRAPH
# Data wrangling in two steps:
# - filtering the highest frequencies in descending order,
# - switching from wide format to tidy format for ggplot2.
output_partial <- output %>%
filter(.[2] >= 100 | .[3] >= 100) %>%
gather(key = device, value = n, "Android":"iPhone")
# Giving values to 6 arguments from the graphic function
# that will be coded just after.
graph_title <- "Single Punctuation Usage by Device - Graph 1"
x_title <- ""
y_title <- "Number of Occurrences"
angle <- 30
name1 <- "Punctuation Mark"
name2 <- "Number of Occurrences"
# FUNCTION TO PRODUCE THE 3 GRAPHS (and many others later on)
graphic_function <- function(data, title1, title2, title3,
angle, name1, name2) {
graph <- data %>%
ggplot(aes(x = reorder(punct, -n), y = n, fill = device)) +
geom_bar(stat='identity', position = "dodge") +
# Specifies labels.
labs(title = title1,
x = title2,
y = title3) +
theme(plot.title = element_text(hjust = 0.5, vjust = 3,
size = 16, face = "bold",
color = deep_blue),
axis.title.x = element_text(vjust = 2, size = 14,
color = deep_blue),
axis.title.y = element_text(vjust = 2, size = 14,
color = deep_blue),
legend.title = element_blank(),
axis.text.x = element_text(hjust = 0.5, angle = angle,
size = 12, color = deep_blue),
axis.text.y = element_text(size = 12, color = deep_blue),
legend.text = element_text(size = 14, face = "bold",
color = deep_blue),
legend.background = element_rect(fill = white),
legend.position = "bottom",
# Removes vertical grid lines.
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
# Colors horizontal grid lines.
panel.grid.major.y = element_line(color = sky_blue,
size = 2),
panel.grid.minor.y = element_line(color = sky_blue,
size = 2),
# Formats axis ticks.
axis.ticks.x = element_line(size = 25, color = deep_blue),
axis.ticks.y = element_blank(),
panel.border = element_blank(),
# Specifies background colors.
panel.background = element_rect(fill = white),
plot.background = element_rect(fill = white)) +
# Specifies device colors.
scale_fill_manual(values = duo_Palette_bluishgreen_gray)
# Makes the graph interactive.
p <- ggplotly(graph, height = 500) %>%
layout(legend = list(orientation = "h", x = 0.3, y = -0.3),
hoverlabel = list(bordercolor = white))
rm(graph)
# Clarifies hover information.
for (i in 1:2) {
p$x$data[[i]]$text <-
str_replace(p$x$data[[i]]$text,
"reorder\\(punct, -n\\)",
name1)
p$x$data[[i]]$text <-
str_replace(p$x$data[[i]]$text, "n:", paste(name2, ":", sep = ""))
p$x$data[[i]]$text <-
str_replace(p$x$data[[i]]$text, "device", "Device")
}
# Centers the graph, because the centering opts_chunk previously
# inserted has not proved fully operative with function ggplotly().
htmltools::div(p, align = "center")
}
graphic_function(output_partial, graph_title, x_title, y_title,
angle, name1, name2)The Android and iPhone occurrence numbers diverge most, at least in proportion to each other, for double quotes and colons. Both double quotes and colons seem to bring valuable predictive information.
On the contrary, for hyphens and exclamation marks, the two devices diverge very moderately in proportion to each other. But, that is no reason to drop these variables. On the contrary, within hyphens and exclamation marks, subgroups of occurrences with larger differences will be looked for; these subgroups could be assembled on the basis, for instance, of characters surrounding these punctuation marks.
The search for more diverging subgroups will be a general one; it will not be limited to features such as hyphens and exclamation marks; it will extend to features such as dots, which show a less substantial but nevertheless solid divergence between devices, and even to features such as double quotes, which already show strong divergences between devices.
As far as double quotes are concerned, the gap between devices had already been noticed when examining the first two interactive tables with Android and iPhone tweets. The divergence in double quotes was expected since we had noticed that the Android device uses double quotes while the iPhone utilizes mostly single quotes: consequently, it is absolutely normal to have much more double quotes in tweets sent by the Android device than in tweets sent by the iPhone.
But shouldn’t the inverse proportion be expected for single quotes, that is to say much more single quotes in iPhone tweets than in Android tweets? Not necessarily. Why? Because it is the same mark as apostrophes! And there are many short forms — contractions such as it’s — on both sides. There are also genitive possessive forms — ’s or s’. Consequently, there are many apostrophes/single quotes as well on the side of the Android device. Later on, apostrophes will be disentangled from single quotes, with a view to hopefully discover a strong occurrence divergence, at least in single quotes if not in apostrophes.
In passing, it can be noticed that the percent sign is not treated in this section: it will be dealt with among special sequences, more specifically in the combinations of numbers directly followed by the percent sign. The same holds for the ampersand symbol — & — and the semi-colon punctuation mark, which will be both treated in the scope of the special sequence &. Outside of these special sequences, semi-colon appears just once, & and % do not appear at all.
A second graph is needed for the selected punctuation marks having occurrence numbers inferior to 100 for both devices but one occurrence number above or equal to 50 for at least for one of the devices.
# SECOND GRAPH
# Data wrangling in two steps:
# - filtering the highest frequencies,
# - switching from wide format to tidy format for ggplot2.
output_partial <- output %>%
filter(.[2] < 100 & .[3] < 100) %>%
filter(.[2] >= 50 | .[3] >= 50) %>%
gather(key = device, value = n, "Android":"iPhone")
# Values for 6 arguments from the graphic function
graph_title <- "Single Punctuation Usage by Device - Graph 2"
x_title <- ""
y_title <- "Number of Occurrences"
angle <- 30
name1 <- "Punctuation Mark"
name2 <- "Number of Occurrences"
# Graphic function defined in previous code chunk.
graphic_function(output_partial, graph_title, x_title, y_title,
angle, name1, name2)Ellipses stand out in occurrence divergence between the two devices since there are no occurrences of ellipsis in Android tweets. Moreover, the number of occurrences is not negligible in iPhone tweets, with a total of 79. It seems reasonable to expect some predictive power.
Left parentheses and right parentheses also offer a nice occurrence disproportion between devices. Actually, disproportion is identical for both punctuation marks, which is not extraordinarily surprising. There is probably duplicate information: once again, subgroups of occurrences will be looked for in order to try to retrieve more predictive information.
Slashes look also like a valuable prospective predictor.
The picture is less impressive on the side of question marks, but, once again, subgroups of occurrences will be looked for by taking into account surrounding characters.
A third graph is necessary for the remaining single punctuation marks.
# THIRD GRAPH
# Data wrangling in two steps:
# - filtering the highest frequencies,
# - switching from wide format to tidy format for ggplot2.
output_partial <- output %>%
filter(.[2] < 50 & .[3] < 50) %>%
gather(key = device, value = n, "Android":"iPhone")
# Value for 6 arguments from the graphic function
graph_title <- "Punctuation Usage by Device - Graph 3"
x_title <- ""
y_title <- "Number of Occurrences"
angle <- 30
name1 <- "Punctuation Mark"
name2 <- "Number of Occurrences"
# Graphic function defined two code chunks above.
graphic_function(output_partial, graph_title, x_title, y_title,
angle, name1, name2)Only the iPhone is present in en dashes, em dashes, and plus signs. Occurrence numbers are very limited but dashes could be a characteristic of the iPhone tweets.
The dollar sign frequencies are not uninteresting. Maybe a subgroup of occurrences can be spotted.
No underscore is reported, which is not so surprising since placeholders have been substituted to entities — URLs, mentions, and hashtags — where underscore characters are present. The entities will be studied later on.
Now focus will be moved to double punctuation marks, without any empty space character in between, such as two hyphens in a row, two dots in a row, etc.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(output, output_partial, graph_title, x_title, y_title,
angle, name1, name2)
# Vector of double punctuation marks to be detected.
punctuation_marks <- c("..", "??", "!!", "--")
list_of_names <-
c("Double Dot", "Double Question Mark",
"Double Exclamation Mark", "Double Hyphen")
# Regex pattern representing triple (or more) punctuation marks
# to be discarded first before counting double punctuation marks.
to_be_discarded <- c("\\.", "\\?", "!", "-")
to_be_discarded <-
paste(to_be_discarded, "{3,}", "|", sep = "", collapse = "")
to_be_discarded <- str_replace(to_be_discarded, "\\|$", "")
# Discards triple (or more) punctuation marks from tweets
# just for counting double punctuation marks.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, to_be_discarded, "")
rm(to_be_discarded)
# Receptacle data frame for numbers of occurrences of double
# punctuation marks
output <-
data.frame(matrix(length(punctuation_marks) * 4,
nrow = length(punctuation_marks),
ncol = 4) * 1)
# For loop to count occurrences of double punctuation marks
for (i in 1:length(punctuation_marks)) {
occurrence_punctuation_marks <-
train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(number = str_count(text,
fixed(punctuation_marks[i]))) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(number = sum(number)) %>%
mutate(n_reg = round(number * comparability_factor, 0)) %>%
mutate(device = as.character(device))
output[[i, 1]] <- list_of_names[i]
output[[i, 2]] <- occurrence_punctuation_marks$number[1]
output[[i, 3]] <- occurrence_punctuation_marks$number[2]
output[[i, 4]] <- occurrence_punctuation_marks$n_reg[2]
}
rm(i, buffer, occurrence_punctuation_marks)
# The next part from this code chunk constructs and prints
# a table with a largely customizable header.
# First, column names from output are removed.
names(output) <- NULL
# A new vector of column names is created.
name <- c("Double Punctuation Mark",
"Occurences from Android",
"Occurrences from iPhone",
"iPhone Number Sample-adjusted")
# Assembles output and the vector of column names, which becomes
# the first row and appears as a new header largely customizable.
tab <- rbind(name, output)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE) %>%
column_spec(1, color = white, background = deep_blue) %>%
column_spec(2, color = white, background = bluish_green) %>%
column_spec(3, color = white, background = gray) %>%
column_spec(4, color = white, background = gray_palette) %>%
row_spec(1, color = white, background = deep_blue)| Double Punctuation Mark | Occurences from Android | Occurrences from iPhone | iPhone Number Sample-adjusted |
| Double Dot | 1 | 1 | 1 |
| Double Question Mark | 0 | 1 | 1 |
| Double Exclamation Mark | 0 | 2 | 2 |
| Double Hyphen | 1 | 22 | 19 |
Double hyphens show an uneven occurrence breakdown between devices and represent a prospective predictor.
The next graph shows statistics of triple punctuation marks, without any empty space character, such as three dots in a row — not to be confused with ellipses represented by a single Unicode character U+2026.
# Vector of triple punctuation marks to be detected.
punctuation_marks <- c("...", "---")
list_of_names <- c("Triple Dot", "Triple Hyphen")
# Regex pattern of quadruple (or more) punctuation marks that
# will be discarded before counting triple punctuation marks.
to_be_discarded <- c("\\.", "\\?", "!", "-")
to_be_discarded <-
paste(to_be_discarded, "{4,}", "|", sep = "", collapse = "")
to_be_discarded <- str_replace(to_be_discarded, "\\|$", "")
# Discards quadruple (or more) punctuation marks from tweets
# just for counting triple punctuation marks.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, to_be_discarded, "")
rm(to_be_discarded)
# Receptacle data frame for numbers of occurrences
# of triple punctuation marks
output <-
data.frame(matrix(length(punctuation_marks) * 4,
nrow = length(punctuation_marks),
ncol = 4) * 1)
# For loop to count occurrences of triple punctuation marks.
for (i in 1:length(punctuation_marks)) {
occurrence_punctuation_marks <-
train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(number = str_count(text,
fixed(punctuation_marks[i]))) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(number = sum(number)) %>%
mutate(n_reg = round(number * comparability_factor, 0)) %>%
mutate(device = as.character(device))
output[[i, 1]] <- list_of_names[i]
output[[i, 2]] <- occurrence_punctuation_marks$number[1]
output[[i, 3]] <- occurrence_punctuation_marks$number[2]
output[[i, 4]] <- occurrence_punctuation_marks$n_reg[2]
}
rm(i, buffer, occurrence_punctuation_marks)
# The next part from this code chunk constructs and prints
# a table with a largely customizable header.
# First, column names from output are removed.
names(output) <- NULL
# A new vector of column names is created.
name <-
c("Triple Punctuation Mark",
"Occurrences from Android",
"Occurrences from iPhone",
"iPhone Number Sample-adjusted")
# Assembles output and the vector of column names, which becomes
# the first row and appears as a new header largely customizable.
tab <- rbind(name, output)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE) %>%
column_spec(1, color = white, background = deep_blue) %>%
column_spec(2, color = white, background = bluish_green) %>%
column_spec(3, color = white, background = gray) %>%
column_spec(4, color = white, background = gray_palette) %>%
row_spec(1, color = white, background = deep_blue)| Triple Punctuation Mark | Occurrences from Android | Occurrences from iPhone | iPhone Number Sample-adjusted |
| Triple Dot | 4 | 10 | 9 |
| Triple Hyphen | 2 | 15 | 13 |
Triple hyphens seem interesting. Possibly triple dots as well. In passing, triple dots are considered here as three separate dots and are not to be confused with ellipses represented by a single Unicode character U+2026.
What about quadruple punctuation marks?
# Vector of quadruple punctuation marks to be detected.
punctuation_marks <- c("....", "----")
list_of_names <- c("Quadruple Dot", "Quadruple Hyphen")
# Regex pattern of quintuple (or more) punctuation marks that
# will be discarded before counting quadruple punctuation marks.
to_be_discarded <- c("\\.", "\\?", "!", "-")
to_be_discarded <-
paste(to_be_discarded, "{5,}", "|", sep = "", collapse = "")
to_be_discarded <- str_replace(to_be_discarded, "\\|$", "")
# Discards quintuple (or more) punctuation marks from tweets
# just for counting quadruple punctuation marks.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, to_be_discarded, "")
# Receptacle data frame for numbers of occurrences
# of quadruple punctuation marks
output <-
data.frame(matrix(length(punctuation_marks) * 4,
nrow = length(punctuation_marks),
ncol = 4) * 1)
# For loop to count the occurrences of quadruple punctuation marks.
for (i in 1:length(punctuation_marks)) {
occurrence_punctuation_marks <-
train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(number = str_count(text,
fixed(punctuation_marks[i]))) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(number = sum(number)) %>%
mutate(n_reg = round(number * comparability_factor, 0)) %>%
mutate(device = as.character(device))
output[[i, 1]] <- list_of_names[i]
output[[i, 2]] <- occurrence_punctuation_marks$number[1]
output[[i, 3]] <- occurrence_punctuation_marks$number[2]
output[[i, 4]] <- occurrence_punctuation_marks$n_reg[2]
}
rm(i, buffer, occurrence_punctuation_marks)
# The next part from this code chunk constructs and prints
# a table with a largely customizable header.
# First, column names from output are removed.
names(output) <- NULL
# A new vector of column names is created.
name <-
c("Quadruple Punctuation Mark",
"Android Occurrences",
"iPHone Occurrences",
"iPhone Number Sample-adjusted")
# Assembles output and the vector of column names which becomes
# the first row and appears as a new header largely customizable.
tab <- rbind(name, output)
rm(name, output)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE) %>%
column_spec(1, color = white, background = deep_blue) %>%
column_spec(2, color = white, background = bluish_green) %>%
column_spec(3, color = white, background = gray) %>%
column_spec(4, color = white, background = gray_palette) %>%
row_spec(1, color = white, background = deep_blue)| Quadruple Punctuation Mark | Android Occurrences | iPHone Occurrences | iPhone Number Sample-adjusted |
| Quadruple Dot | 0 | 3 | 3 |
| Quadruple Hyphen | 0 | 0 | 0 |
The number of occurrences of quadruple dots is very limited. But repetition of punctuation marks might be more an iPhone characteristic than an Android characteristic:
Consequently, the number of quadruple dots will be maintained as potential predictive information.
Single quotation marks are now under scrutiny. How could they be detected?
Quotes are used in pairs. But what about a tweet with two apostrophes, for instance a tweet with two short forms? They cannot be considered as quotes because the number of occurrences is an even number!
Consequently, single quote occurrences will be counted by deducting apostrophe occurrences. Apostrophe occurrences will be grammatically detected, at least most of them as being present in some sequences. Here are the cases in which apostrophes will be detected and deducted:
Apostrophes in possessive forms ending with s’ will not be deducted; indeed, doing so might also eliminate quotation marks.
But these apostrophes will be hopefully deducted by applying the rule no odd number of single quotes: if, after deducting all abovementioned cases, the occurrence number of the punctuation mark ’ in any tweet is an odd number, the occurrence number will be automatically reduced by 1. Of course, this rule would not only apply in the case of possessive forms ending with s’ but also in all cases where an apostrophe has not been deducted.
Can that rule suffice to eliminate all remaining apostrophes? Of course, that rule would not meet the challenge of deducting two or more apostrophes remaining in a tweet. But this scenario never happens in tweets from the training set, as that will be checked in a detailed way. Consequently, in the training set, the rule no odd number of single quotes suffices to eliminate all remaining apostrophes, which are few.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(punctuation_marks, list_of_names, tab)
# In order to facilitate text mining,
# tweet texts will be lowercased since the short forms are
# already lowercased in the package stopwords.
train_utf8_no_urls_no_apostrophes <-
train_utf8_no_urls_mentions_hashtags %>%
select(id_str, text, device) %>%
mutate(text = str_to_lower(text, locale = "en"))
# Apostrophes will be removed if they are in
# - possessive forms 's,
# - plural forms such as $'s,
# - enclosed apostrophes, in, for instance, short forms or
# some family names like O'Reilly,
# - apostrophes in abbreviated two-digit format for years,
# - apostrophes at the end of 2 colloquialisms: "ya'" and "lyin'".
# Apostrophes in possessive forms ending with s' will not be removed
# because this might remove quotation marks as well. That problem
# will be tackled below in a completely different way
# by the rule "no odd number of single quotes".
train_utf8_no_urls_no_apostrophes <-
train_utf8_no_urls_no_apostrophes %>%
mutate(text = str_replace_all(text,
"([^\\s])(\\')(s)", "\\1\\3")) %>%
mutate(text = str_replace_all(text,
"([A-Za-z])(\\')([A-Za-z])", "\\1\\3")) %>%
mutate(text = str_replace_all(text,
"(\\')(\\d{2})", "\\2")) %>%
mutate(text = str_replace_all(text,
"(ya)(\\')", "\\1")) %>%
mutate(text = str_replace_all(text,
"(lyin)(\\')", "\\1")) %>%
mutate(text = str_replace_all(text,
"(#)(\\')(s)", "\\1\\3")) %>%
mutate(text = str_replace_all(text,
"(\\$)(\\')(s)", "\\1\\3"))
# There can still remain apostrophes, irrespective of single
# quotation marks, even if their number is probably very limited.
# The number of single quotation marks should be an even number.
# If there remains one apostrophe (e.g. in a possessive form
# of the type "s'" as in "parents'"), then the global number
# of "'" should be an odd number. In such a case, the number
# of occurrences will be diminished by 1.
# Of course, this wouldn't solve the problem if there were
# two or more apostrophes left, but this will be checked below.
train_utf8_no_urls_no_apostrophes <-
train_utf8_no_urls_no_apostrophes %>%
mutate(number = str_count(text, "\\'"))
for (i in 1:nrow(train_utf8_no_urls_no_apostrophes)) {
# If the number of ' is an odd number ...
if ((train_utf8_no_urls_no_apostrophes$number[i] %% 2) > 0) {
# ... then the number is reduced by 1.
train_utf8_no_urls_no_apostrophes$number[[i]] <-
train_utf8_no_urls_no_apostrophes$number[i] - 1
}
}
rm(i)
# Now, the number of single quotation marks can be computed.
tab <-
train_utf8_no_urls_no_apostrophes %>%
group_by(device) %>%
summarise(number = sum(number)) %>%
mutate(n_reg = round(number * comparability_factor, 0))
# The next part from this code chunk constructs and prints
# a table with a largely customizable header.
# First, column names from tab are removed.
names(tab) <- NULL
# A new vector of column names is created.
name <- c("Device",
"Occurrence Number of Single Quotes",
"Number Sample-adjusted")
# Assembles tab and the vector of column names, which becomes
# the first row and appears as a new header largely customizable.
tab <- rbind(name, tab)
rm(name)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE,
color = white, background = deep_blue) %>%
row_spec(2, color = white, background = bluish_green) %>%
row_spec(3, color = white, background = gray_palette)| Device | Occurrence Number of Single Quotes | Number Sample-adjusted |
| Android | 0 | 0 |
| iPhone | 108 | 95 |
As the table above shows, disentangling single quotes and apostrophes delivers an impressive result: there are 108 single quote occurrences in iPhone tweets and none in Android tweets. This looks like a promising prospective predictor.
Has the rule no odd number of single quotes worked perfectly?
In order to answer that question, two interactive tables — one for each device — will be produced with three columns:
The first table is about Android tweets. It has been inserted although the above disentangling process did not count any single quote in Android tweets. In the Android table, we can check whether the rule no odd number of single quotes appropriately rejected some apostrophes before arriving at a zero number of remaining single quotes.
# Index of tweets with quotes and possibly some remaining apostrophes
index <-
str_detect(train_utf8_no_urls_no_apostrophes$text, "\\'")
index <- which(index == T)
# Table with such tweets from the Android device, without column names
tab <-
train_utf8_no_urls_no_apostrophes[index, ] %>%
filter(device == "Android") %>%
select(- device) %>%
`colnames<-`(NULL)
rm(index)
# Vector of column names
name <- c("Identifier",
"Android Tweet with Single Quotation Marks or Remaining Apostrophes",
"How Many Single Quotes Have Been Detected?")
# Constructs a table, taking the vector of column names
# as a row for formatting convenience.
tab <- rbind(name, tab)
rm(name)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE,
color = white, background = deep_blue) %>%
row_spec(2, color = white, background = bluish_green)| Identifier | Android Tweet with Single Quotation Marks or Remaining Apostrophes | How Many Single Quotes Have Been Detected? |
| 690529690205818880 | hashtagplaceholdercruz eligibility to be president not settled law, says cruz’ constitutional law professor, hashtagplaceholder urlplaceholder" | 0 |
The rule no odd number of single quotes has worked correctly! Indeed, there is one ’ mark in this tweet and the rule has rejected that occurrence when counting the number of single quotes, which is correct since this is an apostrophe in “Cruz’ constitutional law professor”.
Cruz’ has been preferred to Cruz’s. In Cruz’s, the apostrophe would have been discarded by the cleaning process above. It is not discarded in Cruz’ but this is compensated by the rule no odd number of single quotes: since the number of ’ marks is one and thus an odd number, it is decreased by one and so becomes zero. Consequently, the count of quotation marks is correct: there is no quotation mark in that tweet.
Readers interested in that grammatical usage could refer, for instance, to this page .
This is it for the Android tweets. Now the iPhone tweets.
For the iPhone, the disentangling process above has counted 108 occurrences of single quotes. The next table will show all iPhone tweets that still have the ’ mark. Two questions should be answered on the basis of the next table:
# Index of tweets with quotes and possibly some remaining apostrophes
index <-
str_detect(train_utf8_no_urls_no_apostrophes$text, "\\'")
index <- which(index == T)
# Table with such tweets from the Android device
tab <-
train_utf8_no_urls_no_apostrophes[index, ] %>%
filter(device == "iPhone") %>%
select(- device) %>%
`colnames<-`(c("Identifier",
"iPhone Tweet with Single Quotation Mark or Remaining Apostrophes",
"Quotation Marks"))
rm(index, train_utf8_no_urls_no_apostrophes)
# Creating an interactive data table, using the DT package.
# This JavaScript extension Will color/background color the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# This JavaScript extension Will color/background color the body text.
rowCallback <-
c("function(row, data, index, rowId) {
",
"console.log(rowId)",
"if(rowId >= 0) {
",
"row.style.backgroundColor = '#999999',
row.style.color = 'White';",
"}",
"}
")
# Prints the interactive datatable.
datatable(tab, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(tab) - 1))),
initComplete = JS(initComplete),
rowCallback = JS(rowCallback)))Once again, the count of single quotation marks is correct!
All single quotes in the tweets from the second column have been counted as single quotes in the third column.
There is one apostrophe in the tweet with identifier 736410143378792448: it is an apostrophe in the possessive form included in refugees’ social media accounts. Very correctly, this apostrophe has not been counted as a single quote in the third column, which shows zero in that row. Indeed, since the number of occurrences of the punctuation mark ’ is 1 in that row, that is to say an odd number, the rule no odd number of single quotes has decreased the number by 1 and the number has become … zero.
Grammatically distinguishing single quotes and apostrophes has lead to a promising prospective predictor: the number of single quotes, which have been used only by the iPhone in the training set tweets. The number of apostrophes is not so different between devices, especially so if sample size is taken into account.
As far as double quotes are concerned, the Android device dominates clearly, which is not surprising since it does not use any single quote … Double quote count is also a promising prospective predictor.
Consequently, there are until now three prospective predictors based on quotes and apostrophes:
These prospective predictors have been produced by drilling down through tweets that have been standardized by a two-way process:
This has simplified Exploratory Data Analysis and has fostered the splitting between single quotes and apostrophes. It has also deprived analysis from part of the information. The hidden part of information needs fetching back now, in order to mine into increased typographical diversity.
After standardization, two typographical (or computing) characters had been spotted:
The Android device uses straight apostrophes and straight double quotes.
Without that standardization, five typographical characters have been detected:
After checking, which is not shown here for reasons of brevity, in iPhone tweets, there are straight single quotes and straight apostrophes (same character) and there are curly single quotes and curly apostrophes (same character). Consequently, the typographical difference between straight and curly single quote/apostrophe was anyway no solution to differentiate single quotes and apostrophes, which has been done completely differently before — by using surrounding characters.
As far as single quotes and apostrophes are concerned, the iPhone uses more straight ones than curly ones.
As far as quotes are concerned, the iPhone uses mainly single quotes but also some straight double quotes and a few curly double quotes (opening ones and closing ones).
The next table shows the number of curly characters used by the iPhone in the training set tweets. Of course, this is based on the non-standardized training set.
# Recalling the original dataset in order to have a
# non-standardized version.
data("trump_tweets")
tweets <- trump_tweets
rm(trump_tweets)
# The device names are kept only for both devices
# we are interested in. Moreover, they are simplified.
# The tweets are kept only if they were tweeted between the day
# Candidate Donald Trump announced his campaign and election day.
temporary_data_set <- tweets %>%
mutate(device = str_replace_all(
str_replace_all(source, "Twitter for Android", "Android"),
"Twitter for iPhone", "iPhone")) %>%
filter(device %in% c("Android", "iPhone") &
created_at >= ymd("2015-06-17") &
created_at < ymd("2016-11-08")) %>%
select(- source)
rm(tweets)
# Creating the training set.
train_no_utf8 <- temporary_data_set[ind_train, ]
rm(temporary_data_set, ind_train)
tab <- train_no_utf8 %>%
select(text, device) %>%
mutate(n = str_count(text, "’|‘|“|”")) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(n)) %>%
mutate(device = as.character(device)) %>%
mutate(n_reg = round(n * comparability_factor, 0))
# The next part from this code chunk constructs and prints
# a table with a largely customizable header.
# First, column names from tab are removed.
names(tab) <- NULL
# A new vector of column names is created.
name <-
c("Device",
"Number of Curly Single Quotes, Curly Double Quotes, and Curly Apostrophes",
"Number Sample-adjusted")
# Assembles tab and the vector of column names, which becomes
# the first row and appears as a new header largely customizable.
tab <- rbind(name, tab)
rm(name)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE,
color = white, background = deep_blue) %>%
row_spec(2, color = white, background = bluish_green) %>%
row_spec(3, color = white, background = gray_palette) %>%
column_spec(1, width = "1.5in") %>%
column_spec(2, width = "3.5in") | Device | Number of Curly Single Quotes, Curly Double Quotes, and Curly Apostrophes | Number Sample-adjusted |
| Android | 0 | 0 |
| iPhone | 70 | 61 |
There is a clear-cut difference between the Android device and the iPhone with respect to curly quotes and curly apostrophes: the Android device uses none of them. Obviously, the number of curly characters in each tweet might be a valuable prospective predictor.
In a first step, a general prospective approach will be developed about all enclosed punctuation sequences — that is to say one punctuation mark or several punctuation marks in a row — that are surrounded by characters that are no empty space characters … and no punctuation marks. That general approach is comprised of
In a second step, the general approach will be fine-tuned towards some single punctuation marks enclosed in non-empty space characters.
Just as in the previous section, research will be based on tweet texts where entities — URLs, mentions, and hashtags — have been replaced by placeholders that contain no punctuation. Moreover, it is the standardized text version — so without any curly quote or apostrophe.
So, the next graph shows the dispersion of enclosed punctuation sequences.
# Generates a random placeholder for enclosed punctuation sequences.
set.seed(1)
enclosed_punctuation_placeholder <-
paste(sample(c(0:9, letters, LETTERS), 16), collapse = "")
# Replaces enclosed punctuation sequences by placeholder: round 1!
text <- train_utf8_no_urls_mentions_hashtags$text
text <-
str_replace_all(text,
"((?![:punct:])[\\S])([:punct:]+)((?![:punct:])[\\S])",
paste("\\1", enclosed_punctuation_placeholder, "\\3"))
# In case different enclosed punctuation sequences are separated
# by only one non-empty space character, some sequences
# can escape replacement. In, for instance, S.C./N.H./N.Y.
# only three sequences are replaced: S.C N.H N.Y
# Two sequences would not be replaced: C./N H./N
# In order to have, in this example, 5 occurrences of the placeholder,
# let's launch a second replacement round.
text <-
str_replace_all(text,
"((?![:punct:])[^\\s])([:punct:]+)((?![:punct:])[^\\s])",
paste("\\1", enclosed_punctuation_placeholder, "\\3"))
# Data frame with enclosed punctuation sequences being identified.
tweets_with_enclosed_punctuation <-
train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = text) %>%
mutate(index = str_count(text, enclosed_punctuation_placeholder))
rm(enclosed_punctuation_placeholder, text)
# We'll keep the data frame tweets_with_enclosed_punctuation
# for printing in the next code chunk.
# In this code chunk, let's count the number of tweets that have
# a specific number of occurrences of enclosed punctuation sequences.
df <- tweets_with_enclosed_punctuation %>%
select(- text) %>%
mutate(device = as.factor(device),
index = as.factor(index)) %>%
group_by(device, index, .drop = FALSE) %>%
summarise(n = n()) %>%
filter(n > 12) %>%
# Reformats for the user-defined function graphic_function().
mutate(punct = index) %>%
select(punct, device, n) %>%
mutate(device = as.character(device),
punct = as.integer(punct))
# Parameters for the function graphic_function
graph_title <- "Enclosed Punctuation Sequences"
x_title <- "Number of Enclosed Punctuation Sequences in a Tweet"
y_title <- "Number of Tweets"
angle <- 0
name1 <- "Number of Enclosed Punctuation Sequences in a Tweet"
name2 <- "Number of Tweets"
graphic_function(df, graph_title, x_title, y_title, angle, name1, name2)In absolute numbers, the biggest difference between devices is in tweets with no enclosed punctuation sequence. In proportion, the biggest difference is in tweets with 2 or 3 enclosed punctuation sequences, the Android device representing more than 70 %.
Maybe it is possible to find subgroups of the tweets with bigger difference between devices. In order to potentially delineate such subgroups, let’s have a look at the tweets that contain at least one enclosed punctuation sequence.
For each selected tweet, the number of enclosed punctuation sequences is indicated in the last column. To avoid any misunderstanding, let’s remember that enclosed punctuation sequences are defined as punctuation sequences that are surrounded by characters that are no empty space characters … and no punctuation marks. Let’s take an example: when he is so low in the polls?" Because of his big $ hit ads on me! In that extract, no enclosed punctuation sequence will be detected since ?" is followed by an empty space character! This will be different below when enclosed single punctuation marks are analyzed: then the exclamation mark in that extract will be counted as an enclosed question mark.
Tweets with enclosed punctuation sequences are presented by device, beginning alphabetically with the Android device.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(df, graph_title, x_title, y_title, angle, name1, name2)
# Table with enclosed punctuation sequences in Android tweets
tab <- train_utf8_no_urls_mentions_hashtags %>%
select(device, text) %>%
mutate(index = tweets_with_enclosed_punctuation$index) %>%
filter(device == "Android" & index > 0) %>%
select(text, index) %>%
`colnames<-`(c("ANDROID TWEET",
"ENCLOSED PUNCTUATION SEQUENCE"))
# Creating an interactive data table, using the DT package.
# JavaScript extension that Will color/background color the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# JavaScript extension that will color/background color the body text.
rowCallback <-
c("function(row, data, index, rowId) {
",
"console.log(rowId)",
"if(rowId >= 0) {
",
"row.style.backgroundColor = '#009E73',
row.style.color = 'White';",
"}",
"}
")
# Prints the interactive datatable.
datatable(tab, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(tab) - 1))),
initComplete = JS(initComplete),
rowCallback = JS(rowCallback)))Let’s use this visualization tool to drill down into these tweets which contain enclosed punctuation sequences and fine-tune analysis towards specific punctuation marks.
Apostrophes are numerous; that is no big news, since the previous section has pinpointed numerous apostrophes in short forms, in possessive forms, etc. Enclosure in short forms will be specifically dealt with below, in a section dedicated to stopwords.
Enclosed dots and enclosed hyphens also stand out. They will be further analyzed in this section.
There are also several numbers that contain one or two commas as thousand-separators. These will be analyzed in the section about special sequences.
Many other punctuation marks will show up below.
The next table presents tweets sent by the iPhone and containing at least one enclosed punctuation sequence.
# Table with enclosed punctuation sequences in iPhone tweets
tab <- train_utf8_no_urls_mentions_hashtags %>%
select(device, text) %>%
mutate(index = tweets_with_enclosed_punctuation$index) %>%
filter(device == "iPhone" & index > 0) %>%
select(text, index) %>%
`colnames<-`(c("IPHONE TWEET",
"ENCLOSED PUNCTUATION SEQUENCE"))
rm(tweets_with_enclosed_punctuation)
# Creating an interactive data table, using the DT package.
# This JavaScript extension will color/background color the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# This JavaScript extension will color/background color the body text.
rowCallback <-
c("function(row, data, index, rowId) {
",
"console.log(rowId)",
"if(rowId >= 0) {
",
"row.style.backgroundColor = '#999999',
row.style.color = 'White';",
"}",
"}
")
# Prints interactive datatable.
datatable(tab, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(tab) - 1))),
initComplete = JS(initComplete),
rowCallback = JS(rowCallback)))There are numerous apostrophes and, once again, this is not surprising as already pointed out about Android tweets.
Enclosed slashes can be noticed. They will be specifically analyzed just below with other specific punctuation marks. They will come back in the section about special sequences because they often show up in dates.
Enclosed colons often show up in time indications and commas in decimal numbers. That will be tackled in the section dedicated to special sequences.
Specific punctuation marks will be counted now if they are enclosed between non-empty space characters, which can be … other punctuation marks. A breakdown by device is provided.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(tab, initComplete, rowCallback)
# Punctuation marks that will be looked for.
punctuation_marks <- c("\\.", "\\?", "!", ",", ":", "/",
"%", "-", "\\(", "\\)")
list_of_names <- c("Dot", "Question Mark", "Exclamation Mark",
"Comma", "Colon", "Slash", "Percentage",
"Hyphen", "Left Parenthesis",
"Right Parenthesis")
# Receptacle data frame for occurrence counts.
output <-
data.frame(matrix(length(punctuation_marks) * 3,
nrow = length(punctuation_marks),
ncol = 3) * 1) %>%
`colnames<-`(c("punct", "Android", "iPhone"))
# Repetitions of punctuation marks are removed.
for (i in 1:length(punctuation_marks)) {
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <-
str_replace_all(buffer,
paste(punctuation_marks[i], "{2,}", sep = ""),
"")
pattern <-
paste("(?![:punct:])[\\S]", punctuation_marks[i],
"(?![:punct:])[\\S]", sep = "")
occurrence_punctuation_mark <-
train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(number = str_count(text, pattern)) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(number = sum(number)) %>%
mutate(device = as.character(device))
output[[i, 1]] <- list_of_names[i]
output[[i, 2]] <- occurrence_punctuation_mark$number[1]
output[[i, 3]] <- occurrence_punctuation_mark$number[2]
}
rm(i, occurrence_punctuation_mark)
# Presentation graph with numbers of enclosed punctuation marks
# sorted in descending order to foster readability.
# Data wrangling in two steps:
# - filtering the highest frequencies,
# - switching from wide format to tidy format for ggplot2.
output_partial <- output %>%
filter(.[2] > 1 | .[3] > 1) %>%
gather(key = device, value = n, "Android":"iPhone")
# Parameter values for user-defined graphic_function()
graph_title <- "Enclosed Single Punctuation Mark"
x_title <- ""
y_title <- "Number of Tweets"
angle <- 30
name1 <- "Punctuation Mark"
name2 <- "Number of Tweets"
# Running graphic_function() .
graphic_function(output_partial, graph_title, x_title,
y_title, angle, name1, name2)The title reads Enclosed Single Punctuation Mark because the graph is about punctuation marks that are enclosed between non-empty space characters … to the exclusion of themselves. This means that repetitions of these punctuation marks — such as three dots in a row — are first removed. Indeed, repetitions of punctuation marks have already been dealt with.
Enclosed single dot, slash, colon, hyphen, and comma look like good potential predictors.
The occurrence breakdown by device is particularly impressive for enclosed single dots, with 90 % for the Android device. This will be later combined with additional information about, among others, decimal numbers and abbreviations with dots, which will be studied in the section devoted to special sequences.
For slashes, the occurrence breakdown by device is particularly interesting since it shows a strong predominance of the iPhone, which is inverse with respect to the other enclosed single punctuation marks mentioned in the graph above. Consequently, isolating enclosed slashes from these other enclosed punctuation marks gives off valuable information, which would otherwise be watered down in a general group with predominance of the Android device.
That information about slashes will be combined with additional information about slashes included in dates, which will be studied in the section treating special sequences.
For enclosed colons, the occurrence breakdown by device indicates 71 % in favor of the Android device. This too has to be combined with additional information relating to thousand-separators in numbers, which will be studied among special sequences.
Enclosed hyphens also show a predominance of 71 % in favor of the Android device. This too will be further investigated.
special_sequences <-
c("\\\n",
"\\&",
"\\S\\)\\S",
"[^[:punct:]]![^[:punct:]]",
"[a-zA-Z]-\\s",
"\\s-\\s",
"[a-zA-Z]-[a-zA-Z]",
"[a-z]\\.\\s",
"^\\.",
"!$",
"\\?$",
"\\.$",
"[:alnum:]$",
"\\d+",
"[\\s\\$]\\d{1,3},\\d{3}[\\s[:punct:][a-zA-Z]][^\\d]",
"[\\s\\$]\\d{1,3},\\d{3},\\d{3}[\\s[:punct:][a-zA-Z]][^\\d]",
"\\d+\\.\\d+",
"\\d+\\%",
"\\sA.M.|\\sP.M.",
"\\d{1,2}\\:\\d{2}(am|pm)",
"[^\\d]\\d{1,2}/\\d{1,2}/(\\d{2}){1,2}[\\s[a-zA-Z][:punct:]]",
"[\\s[:punct:]]U.S.[\\s[:punct:]]",
"U.S.A.",
"Lyin\\'|lyin\\'|LYIN\\'",
"Ya\\'|ya\\'|YA\\'",
"Havn\\'t|havn\\'t|HAVN\\'T",
"^#1|#1[^\\w]|#1$")
list_of_names <-
c("\\n",
"&",
"Right Par. Nested",
"Excl. Mark w/o Punct.",
"Hyphen Left Enc.",
"Disentangled Hyphen",
"Hyphenated \n Compound",
"Period",
"Upfront Dot",
"Tweet End = !",
"Tweet End = ?",
"Tweet End = .",
"Tweet End = Alphanum.",
"Digits",
"1 Thousand-sep.",
"2 Thousand-sep.",
"Decimal Number",
"Number in %",
"A.M. or P.M.",
"00:00am/pm",
"00/00/00(00)",
"U.S.",
"U.S.A.",
"lyin'",
"ya'",
"havn't",
"#1")
# Receptacle data frame for numbers of occurrences
output <-
data.frame(matrix(length(special_sequences) * 3,
nrow = length(special_sequences),
ncol = 3) * 1) %>%
`colnames<-`(c("punct", "Android", "iPhone"))
# For loop to calculate the numbers of occurrences
for (i in 1:length(special_sequences)) {
pattern <- special_sequences[i]
occurrence_special_sequences <-
train_utf8_no_urls_mentions_hashtags %>%
select(text, device) %>%
mutate(number = str_count(text, pattern)) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(number = sum(number)) %>%
mutate(device = as.character(device))
output[[i, 1]] <- list_of_names[i]
output[[i, 2]] <- occurrence_special_sequences$number[1]
output[[i, 3]] <- occurrence_special_sequences$number[2]
}
rm(i, pattern, occurrence_special_sequences)
# There will be four presentation graphs according to size
# in order to foster readability.
# FIRST GRAPH
# Data wrangling in two steps:
# - filtering the highest frequencies,
# - switching from wide format to tidy format for ggplot2.
output_partial <- output %>%
filter(.[2] >= 250| .[3] >= 250) %>%
gather(key = device, value = n, "Android":"iPhone")
# Parameter values for user-defined graphic_function()
graph_title <- "Special Sequences - Graph1"
x_title <- ""
y_title <- "Number of Tweets"
angle <- 30
name1 <- "Special Sequence"
name2 <- "Number of Tweets"
# Running graphic_function() .
graphic_function(output_partial, graph_title, x_title,
y_title, angle, name1, name2)The prospective predictor new line — \n — is powerfully predictive:
The potential predictor Tweet End = Alphanumeric indicates that a tweet ends with an alphanumeric character. It seems to have predictive power as well, as shown by the following characteristics :
The prospective predictor Excl. Mark w/o Punct. refers to single exclamation marks that are not surrounded by punctuation; enclosure in other punctuation marks has been allowed here because that delivers in this case a much more uneven occurrence breakdown by device. While exclamations marks showed only a very slight predominance of the iPhone with 53 %, with this new potential predictor the predominance of the iPhone jumps to 84 %.
Parallelwise, another subgroup of exclamation marks takes the opposite stand. The prospective predictor Tweet End = ! relates to exclamation marks at the very end of tweets. This time, this is a predominance of the Android device with 79 %!
So, the group of exclamation marks, with quasi-equilibrium between devices, has been split into two subgroups with strong and opposite predominance, delivering substantial additional information.
The potential predictor Period is based on dots following a lowercased letter and followed by an empty space character. It might often be a period — or full stop —, which is the reason why it has been named so.
# SECOND GRAPH
# Data wrangling in two steps:
# - filtering the highest frequencies,
# - switching from wide format to tidy format for ggplot2.
output_partial <- output %>%
filter(.[2] < 250 & .[3] < 250) %>%
filter(.[2] >= 50 | .[3] >= 50) %>%
gather(key = device, value = n, "Android":"iPhone")
# Parameter values for user-defined graphic_function()
graph_title <- "Special Sequences - Graph 2"
x_title <- ""
y_title <- "Number of Tweets"
angle <- 30
name1 <- "Special Sequence"
name2 <- "Number of Tweets"
# Running graphic_function().
graphic_function(output_partial, graph_title, x_title,
y_title, angle, name1, name2)All special sequences in the graph above can bring predictive information, maybe with a special mention for the prospective predictors Hyphen Left Enc. and Hyphenated Compound, which show complete predominance of one device. There is a third possible predictor about hyphens, called Disentangled Hyphen, referring to hyphens surrounded by empty space characters; they differ from en dashes and em dashes illustrated above, in the section Punctuation Marks. At the end of the day, the group of hyphens, which was pretty much in equilibrium between the two devices as shown in the section Punctuation Marks, has been split into three subgroups with clear predominance.
# THIRD GRAPH
# Data wrangling in two steps:
# - filtering the highest frequencies,
# - switching from wide format to tidy format for ggplot2.
output_partial <- output %>%
filter(.[2] < 50 & .[3] < 50) %>%
filter(.[2] >= 18 | .[3] >= 18) %>%
gather(key = device, value = n, "Android":"iPhone")
# Parameter values for user-defined graphic_function()
graph_title <- "Special Sequences - Graph 3"
x_title <- ""
y_title <- "Number of Tweets"
angle <- 30
name1 <- "Special Sequence"
name2 <- "Number of Tweets"
# Running graphic_function() .
graphic_function(output_partial, graph_title, x_title,
y_title, angle, name1, name2)All features will be kept as possible predictors, except for 1 Thousand-sep, which relates to numbers with one thousand-separator.
Two predictors show complete predominance of one device: Right Par. Nested and 00:00am/pm.
Right Par. Nested refers to right — closing — parentheses that are surrounded by non-empty space characters. The word nested has been chosen instead of enclosed — as previously done for other punctuation marks. In the case of enclosed, surrounding characters can be neither punctuation nor empty space characters. For right parentheses, excluding only empty space characters and accepting surrounding punctuation has led to a subgroup with tens of occurrences and complete predominance of one device.
The potential predictor 00:00am/pm corresponds to a time format used only by the iPhone.
The possible predictor lyin’ might be one of the characteristics of a style, it can be rather stable and it has been maintained.
# FOURTH GRAPH
# Data wrangling in two steps:
# - filtering the highest frequencies,
# - switching from wide format to tidy format for ggplot2.
output_partial <- output %>%
filter(.[2] < 18 & .[3] < 18) %>%
gather(key = device, value = n, "Android":"iPhone")
# Parameter values for user-defined graphic_function()
graph_title <- "Special Sequences - Graph 4"
x_title <- ""
y_title <- "Number of Tweets"
angle <- 30
name1 <- "Special Sequence"
name2 <- "Number of Tweets"
# Running graphic_function().
graphic_function(output_partial, graph_title, x_title,
y_title, angle, name1, name2)All features will be maintained as possible predictors except for ya’.
The colloquialism havn’t has been considered as a particularity and, for that reason, as a possible characteristic of a style. Consequently, it is maintained. Here is some rather neutral information from Wiktionary about that short form.
U.S.A. constitutes another potential predictor based on an abbreviation with dots, after the abbreviation U.S. and time format A.M._or_P.M.. The whole topic will be reshuffled when the complete set of possible predictors is established.
In the next section, there comes much more information about abbreviations with dots and without dots.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(punctuation_marks, list_of_names, output, output_partial,
graph_title, x_title, y_title, angle, name1, name2)
# List of abbreviations of American States without dots
state_abb_without_dots <- state.abb
# Replacing State abbreviations with placeholder.
buffer <-
train_utf8_no_urls_mentions_hashtags$text
for (i in 1:length(state_abb_without_dots)) {
# If State abbreviation is at the beginning of a tweet.
buffer <-
str_replace_all(buffer,
paste("(^", state_abb_without_dots[i],
")([\\s[:punct:]])", sep = ""),
"ABBPLACEHOLDER\\2")
# If State abbreviation is in the middle of a tweet.
buffer <-
str_replace_all(buffer,
paste("([\\s[:punct:]])(", state_abb_without_dots[i],
")([\\s[:punct:]])", sep = ""),
"\\1ABBPLACEHOLDER\\3")
# If there are two or more State abbreviations in the
# middle of a tweet but separated only by one punctuation mark
# or one empty space character, such as in "AL/TX", a second
# round is necessary to replace all State abbreviations.
buffer <-
str_replace_all(buffer,
paste("([\\s[:punct:]])(", state_abb_without_dots[i],
")([\\s[:punct:]])", sep = ""),
"\\1ABBPLACEHOLDER\\3")
# If State abbreviation is at the end of a tweet.
buffer <-
str_replace_all(buffer,
paste("([\\s[:punct:]])(", state_abb_without_dots[i],
"$)", sep = ""),
"\\1ABBPLACEHOLDER")
}
rm(i, state_abb_without_dots)
# Occurrence table
tab <-
train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(n = str_count(text, "ABBPLACEHOLDER")) %>%
group_by(device) %>%
summarise(n = sum(n)) %>%
select(device, n) %>%
mutate(device = as.character(device))
# The next part from this code chunk constructs and prints
# a table with a largely customizable header.
# First, column names from tab are removed.
names(tab) <- NULL
# A new vector of column names is created.
name <-
c("Device",
"Occurrences of State Abbreviations without Dots")
# Assembles tab and the vector of column names, which becomes
# the first row and appears as a new header largely customizable.
tab <- rbind(name, tab)
rm(name)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE,
color = white, background = deep_blue) %>%
row_spec(2, color = white, background = bluish_green) %>%
row_spec(3, color = white, background = gray_palette) | Device | Occurrences of State Abbreviations without Dots |
| Android | 3 |
| iPhone | 90 |
The difference between devices is big for abbreviations without dots. This looks like good predictive information.
The same exercise has been conducted about State abbreviations with dots, but the number of occurrences is very limited. The very concept has been extended to other abbreviations with two uppercased letters and dots.
# Occurrences of A.M. and P.M. have already been counted. They
# will be discarded to only count other abbreviations with dots.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, "\\sA.M.|\\sP.M.", "")
# Discarding occurrences of U.S. and U.S.A. for the same reason.
buffer <-
str_replace_all(buffer, "[\\s[:punct:]]U.S.[\\s[:punct:]]", "")
buffer <- str_replace_all(buffer, "U.S.A.", "")
# First, replacing abbreviations in the middle of tweets
# by placeholders.
# 1st iteration
buffer <-
str_replace_all(buffer,
"(\\s|(?!\\.)[:punct:])([:upper:]\\.){2,}([\\s[:punct:]])",
"\\1ABBPLACEHOLDER\\3")
# 2nd iteration
buffer <-
str_replace_all(buffer,
"(\\s|(?!\\.)[:punct:])([:upper:]\\.){2,}([\\s[:punct:]])",
"\\1ABBPLACEHOLDER\\3")
# Second, replacing abbreviations at the very beginning of tweets
# by placeholders.
buffer <-
str_replace_all(buffer,
"^(([:upper:]\\.){2,})([\\s[:punct:]])",
"ABBPLACEHOLDER\\2")
# Third, replacing abbreviations at the very end of tweets
# by placeholders.
buffer <-
str_replace_all(buffer,
"(\\s|((?!\\.)[:punct:]))([:upper:]\\.){2,}$",
"\\1ABBPLACEHOLDER")
# Counting the number of occurrences.
tab <- train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(n = str_count(text, "ABBPLACEHOLDER")) %>%
select(- text) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(n)) %>%
mutate(device = as.character(device))
# The next part from this code chunk constructs and prints
# a table with a largely customizable header.
# First, column names from tab are removed.
names(tab) <- NULL
# A new vector of column names is created.
name <-
c("Device",
"Occurrences of 2-uppercased-letter Abbreviations with Dots")
# Assembles tab and the vector of column names, which becomes
# the first row and appears as a new header largely customizable.
tab <- rbind(name, tab)
rm(name)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE,
color = white, background = deep_blue) %>%
row_spec(2, color = white, background = bluish_green) %>%
row_spec(3, color = white, background = gray_palette) | Device | Occurrences of 2-uppercased-letter Abbreviations with Dots |
| Android | 58 |
| iPhone | 8 |
This is promising predictive information, with the Android device producing more than 88 % of such abbreviations, in spite of the iPhone having slightly more tweets.
This table includes neither U.S., nor U.S.A, nor A.M., nor P.M.. For abbreviations and other aspects related to dots, some reshuffle is looming … below in the section about predictor building.
Apostrophes have already been counted. But — as it has already been done many times for other tweet components — it might be useful to build up a subgroup of apostrophes, such as, for instance, apostrophes in short forms like it’s or isn’t.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(tab)
# First, let's load a list of stopwords from the packaged stopwords.
list_stopwords <-
stopwords::stopwords("en", source = "snowball")
# Second, let's extract the short forms from the stopwords.
index <- str_detect(list_stopwords, "[a-z]+\\'[a-z]+")
index <- which(index == TRUE)
list_short_forms <- list_stopwords[index]
rm(index)
# Third, let's add the short form "havn't", which is used
# by the Android device.
list_short_forms <- append(list_short_forms, "havn't")
# Pattern to count.
list_short_forms_as_pattern <-
paste(list_short_forms, "|", sep = "", collapse = "")
list_short_forms_as_pattern <-
str_replace(list_short_forms_as_pattern, "\\|$", "")
# In order to facilitate text mining, let's lowercase the tweets
# since the short forms are already lowercased in the list.
tab <- train_utf8_no_urls_mentions_hashtags %>%
select(text, device) %>%
mutate(text = str_to_lower(text, locale = "en")) %>%
mutate(n = str_count(text, list_short_forms_as_pattern)) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(n)) %>%
mutate(n_reg = round(n * comparability_factor, 0)) %>%
select(device, n, n_reg) %>%
mutate(device = as.character(device))
# The next part from this code chunk constructs and prints
# a table with a largely customizable header.
# First, column names from tab are removed.
names(tab) <- NULL
# A new vector of column names is created.
name <- c("Device",
"Occurrences of Short Forms",
"Number Sample-adjusted")
# Assembles tab and the vector of column names, which becomes
# the first row and appears as a new header largely customizable.
tab <- rbind(name, tab)
rm(name)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE,
color = white, background = deep_blue) %>%
row_spec(2, color = white, background = bluish_green) %>%
row_spec(3, color = white, background = gray_palette) | Device | Occurrences of Short Forms | Number Sample-adjusted |
| Android | 183 | 183 |
| iPhone | 119 | 104 |
Short forms are globally more numerous in Android tweets than in iPhone tweets. The more so if sample size is taken into consideration — just as is done in the last column. This could be valuable predictive information.
Short forms can be viewed as a way to build up a subgroup of apostrophes, that is to say the subgroup of the apostrophes that make part of short forms — as has been done in the global table just above.
But one can also drill down and investigate the number of occurrences of some specific short forms. That will be done in the rest of this section. First, an interactive wordcloud with short forms and occurrences of short forms in Android tweets.
# A list of short forms has already been extracted from
# the package stopwords: it is list_stopwords.
# Let's extract short forms from stopwords.
index <- str_detect(list_stopwords, "[a-z]+\\'[a-z]+")
index <- which(index == TRUE)
list_short_forms <- list_stopwords[index]
rm(index)
# Let's add the short form "havn't" used by the Android device.
list_short_forms <- append(list_short_forms, "havn't")
# In order to facilitate text mining, let's lowercase the tweets
# since short forms (from the package stopwords) are lowercased.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_to_lower(buffer, locale = "en")
# Receptacle data frame for numbers of occurrences of short forms
l <- length(list_short_forms)
output <-
data.frame(matrix(l * 3, nrow = l, ncol = 3) * 1) %>%
`colnames<-`(c("word", "a", "i"))
# For loop to collect numbers of occurrences of short forms
for (i in 1:l) {
temp <- train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(n = str_count(text,
paste("^", list_short_forms[i],"\\s",
"|\\s", list_short_forms[i],"\\s",
"|\\s", list_short_forms[i], "\\$",
sep = ""))) %>%
select(- text) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(n)) %>%
mutate(device = as.character(device))
output[[i, 1]] <- list_short_forms[i]
output[[i, 2]] <- temp$n[1]
output[[i, 3]] <- temp$n[2]
}
rm(buffer, l, i, temp)
# Extract from the data frame output for the Android device
tab_a <- output %>%
select(word, a) %>%
arrange(desc(a)) %>%
head(., 25)
# Creating an interactive wordcloud for the Android device.
set.seed(1)
wordcloud2(tab_a, shape = "square", gridSize = 30,
minRotation = -pi/2, maxRotation = pi/2, rotateRatio = 1/2,
color = bluish_green, backgroundColor = light_sky_blue,
shuffle = FALSE, size = 1)In the Android wordcloud of short forms, the most frequent short forms are negative ones.
# Data frame for the iPhone
tab_i <- output %>%
select("word", "i") %>%
arrange(desc(i)) %>%
head(., 25)
# Creating an interactive wordcloud for the iPhone.
set.seed(1)
wordcloud2(tab_i, shape = "square", gridSize = 30,
minRotation = -pi/2, maxRotation = pi/2, rotateRatio = 1/2,
color = gray_palette, backgroundColor = light_sky_blue,
shuffle = FALSE, size = 1)In the iPhone wordcloud of short forms, the most frequent short forms are also negative ones but their occurrence numbers are smaller.
The short form i’m is more frequent.
Below, an interactive table gives the occurrence number of all short forms.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(output, tab_a, tab_i)
# A list of short forms has been established in the
# code chunk above and has been kept.
list_short_forms <- sort(list_short_forms)
# A for loop will be developed to compute for each short form
# the number of occurrences in Android tweets and in iPhone tweets.
# Tweets will be lowercased since short forms are lowercased.
l <- length(list_short_forms)
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_to_lower(buffer, locale = "en")
# Receptacle data frame for numbers of occurrences of short forms
output <- data.frame(matrix(l * 4, nrow = l, ncol = 4) * 1)
# Naming columns. Column names will be lodged in variables
# for coding convenience below.
nc1 <- "Short Form"
nc2 <- "Android Occurrences"
nc3 <- "iPhone Occurrences"
nc4 <- "iPhone Number Sample-adjusted"
output <- output %>%
`colnames<-`(c(nc1, nc2, nc3, nc4))
# For loop to collect numbers of occurrences of short forms.
for (i in 1:length(list_short_forms)) {
temp <- train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(n = str_count(text,
paste("^", list_short_forms[i],"\\s",
"|\\s", list_short_forms[i],"\\s",
"|\\s", list_short_forms[i], "\\$",
sep = ""))) %>%
select(- text) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(n)) %>%
mutate(device = as.character(device))
output[[i, 1]] <- list_short_forms[i]
output[[i, 2]] <- temp$n[1]
output[[i, 3]] <- temp$n[2]
output[[i, 4]] <- round(temp$n[2] * comparability_factor[2], 0)
}
rm(l, buffer, i)
# Sorting data frame on column 2.
output <- output %>% arrange(desc(.[2]))
# Creating an interactive data table, using the DT package.
# This JavaScript extension will color/background color the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# Prints the interactive datatable.
datatable(output, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(output) - 1))),
initComplete = JS(initComplete))) %>%
formatStyle(nc1, color = white, backgroundColor = deep_blue) %>%
formatStyle(nc2, color = white, backgroundColor = bluish_green) %>%
formatStyle(nc3, color = white, backgroundColor = gray) %>%
formatStyle(nc4, color = white, backgroundColor = gray_palette) The table above confirms the general prevalence of the Android device in negative short forms.
The affirmative short forms i’m and i’ll — both are lowercased here for code convenience — are more frequent in iPhone tweets.
Some short forms might have some predictive value.
The next table registers the occurrence numbers of all negative short forms together and all affirmative short forms together.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(output, nc1, nc2, nc3, nc4, initComplete)
# We already have a list of short forms, which has been kept.
# Let's split it into negative and affirmative short forms.
# LIST OF NEGATIVE SHORT FORMS
index <- str_detect(list_short_forms, "n't$")
index <- which(index == TRUE)
list_negational_short_forms <- list_short_forms[index]
rm(index)
list_negational_short_forms_as_pattern <-
paste(list_negational_short_forms, "|", sep = "", collapse = "")
list_negational_short_forms_as_pattern <-
str_replace(list_negational_short_forms_as_pattern, "\\|$", "")
# LIST OF AFFIRMATIVE SHORT FORMS
index <- str_detect(list_short_forms, "n't$")
index <- which(index == FALSE)
list_affirmative_short_forms <- list_short_forms[index]
rm(index)
list_affirmative_short_forms_as_pattern <-
paste(list_affirmative_short_forms, "|", sep = "", collapse = "")
list_affirmative_short_forms_as_pattern <-
str_replace(list_affirmative_short_forms_as_pattern, "\\|$", "")
# In order to facilitate text mining, let's lowercase
# the tweets since short forms are already lowercased.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_to_lower(buffer, locale = "en")
# TABLE FOR NEGATIVE FORMS
# Computing occurrence numbers of negative short forms.
tab1 <- train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(n = str_count(text,
list_negational_short_forms_as_pattern)) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(n)) %>%
mutate(device = as.character(device)) %>%
select(device, n)
rm(list_negational_short_forms,
list_negational_short_forms_as_pattern)
# Converting to wide format for presentation.
tab1 <- spread(tab1, device, n) %>%
mutate(name = "Negative Short Forms") %>%
select(name, everything())
# TABLE FOR AFFIRMATIVE FORMS
# Computing occurrence numbers of affirmative short forms.
tab2 <- train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(n = str_count(text,
list_affirmative_short_forms_as_pattern)) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(n)) %>%
mutate(device = as.character(device)) %>%
select(device, n)
rm(list_affirmative_short_forms)
rm(list_affirmative_short_forms_as_pattern)
# Converting to wide format for presentation.
tab2 <- spread(tab2, device, n) %>%
mutate(name = "Affirmative Short Forms") %>%
select(name, everything())
# Joining both tables.
tab_global <- rbind(tab1, tab2[1, ]) %>%
mutate(n_reg = round(.[3] * comparability_factor[2], 0))
rm(tab1, tab2)
# The next part from this code chunk constructs and prints
# a table with a largely customizable header.
# First, column names from tab_global are removed.
names(tab_global) <- NULL
# A new vector of column names is created.
name <- (c("Typology of Short Forms",
"Android Occurrences",
"iPhone Occurrences",
"iPhone Number Sample-adjusted"))
# Assembles tab_global and the vector of column names which becomes
# the first row and appears as a new header largely customizable.
tab <- rbind(name, tab_global)
rm(name, tab_global)
# Prints the table.
knitr::kable(tab, "html", align = "c") %>%
kable_styling(full_width = T, font_size = 16) %>%
row_spec(1, bold = TRUE) %>%
column_spec(1, color = white, background = deep_blue) %>%
column_spec(2, color = white, background = bluish_green) %>%
column_spec(3, color = white, background = gray) %>%
column_spec(4, color = white, background = gray_palette) %>%
row_spec(1, color = white, background = deep_blue) %>%
row_spec(2:3 , bold = F,
extra_css = 'vertical-align: middle !important;')| Typology of Short Forms | Android Occurrences | iPhone Occurrences | iPhone Number Sample-adjusted |
| Negative Short Forms | 143 | 74 | 65 |
| Affirmative Short Forms | 40 | 45 | 40 |
Interestingly enough, the table above shows that the occurrence breakdown by device is very uneven for negative short forms, with predominance of the Android device. On the contrary, for affirmative short forms, the occurrence breakdown by device is almost even.
On this basis, the number of occurrences of negative short forms shows more predictive power.
# A list of stopwords and a list of short forms have already
# been kept. By difference, a list can be deducted with
# stopwords that are no short forms.
list_other_stopwords <-
sort(setdiff(list_stopwords, list_short_forms))
list_other_stopwords_as_pattern <-
paste(list_other_stopwords, "|", sep = "", collapse = "")
list_other_stopwords_as_pattern <-
str_replace(list_other_stopwords_as_pattern, "\\|$", "")
# In order to facilitate text mining, let's lowercase the tweets
# since stopwords are already lowercased.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_to_lower(buffer, locale = "en")
# Receptacle data frame for occurrence numbers of other stopwords
l <- length(list_other_stopwords)
output <-
data.frame(matrix(l * 3, nrow = l, ncol = 3) * 1) %>%
`colnames<-`(c("word", "a", "i"))
# For loop to collect numbers of occurrences of other stopwords.
for (i in 1:l) {
temp <- train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(n = str_count(text,
paste("^", list_other_stopwords[i],"\\s",
"|\\s", list_other_stopwords[i],"\\s",
"|\\s", list_other_stopwords[i], "\\$",
sep = ""))) %>%
select(- text) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(n)) %>%
mutate(device = as.character(device))
output[[i, 1]] <- list_other_stopwords[i]
output[[i, 2]] <- temp$n[1]
output[[i, 3]] <- temp$n[2]
}
rm(l, buffer, i, temp)
# RECALCULATING FOR I BECAUSE EVERYTHING HAD BEEN LOWERCASED
# Tweets without lowercasing
buffer <- train_utf8_no_urls_mentions_hashtags$text
# Index of I in other stopwords
index <- str_detect(list_other_stopwords, "^i$")
index <- which(index == TRUE)
# Data frame with occurrence number
temp <- train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(n = str_count(text, "^I\\s|\\sI\\s|\\sI\\$")) %>%
select(- text) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(n)) %>%
mutate(device = as.character(device))
# Substituting in data frame output.
output[[index, 1]] <- "I"
output[[index, 2]] <- temp$n[1]
output[[index, 3]] <- temp$n[2]
rm(index, buffer, temp)
# Data frame for the Android device
tab_a <- output %>%
select(word, a) %>%
arrange(desc(a)) %>%
head(., 40)
# Creating an interactive wordcloud for the Android device.
set.seed(1)
wordcloud2(tab_a, shape = "square", gridSize = 30,
minRotation = -pi/2, maxRotation = pi/2, rotateRatio = 1/2,
color = bluish_green, backgroundColor = light_sky_blue,
shuffle = FALSE, size = 1.2)The wordcloud above shows predominance of the and and. The stopword the appears 994 times and the stopword and appears 574 times. Would they be more frequent than in tweets sent by the iPhone?
# Data frame for the iPhone
tab_i <- output %>%
select("word", "i") %>%
arrange(desc(i)) %>%
head(., 40)
# Creating an interactive wordcloud for the iPhone.
set.seed(1)
wordcloud2(tab_i, shape = "square", gridSize = 30,
minRotation = -pi/2, maxRotation = pi/2, rotateRatio = 1/2,
color = gray_palette, backgroundColor = light_sky_blue,
shuffle = FALSE, size = 1.2)Here, the stopword the predominates less.
The stopword the appears 548 times in iPhone tweets, against 994 times in Android tweets, although there are more iPhone tweets than Android tweets. This big difference can be useful predictive information. This level of predictive information could even be surprising, coming from a very common function word — or stopword.
The stopword and appears only 239 times in the iPhone wordcloud, against 574 times in tweets from the Android device. But here, is it so surprising? Indeed, the iPhone utilizes & as already seen in the section about special sequences… Just as in the case of the stopword the, occurrence count can have some predictive power.
The stopword you is in second position in the iPhone wordcloud. There are 377 occurrences of it, against 120 in Android tweets. The iPhone produces 76 % of the occurrences of you, which is solidly promising in predictivity. In frequency by tweet, the difference between devices would be a little bit smaller since the training set is comprised of 53 % iPhone tweets and 47 % Android tweets…
On the contrary, the stopword I appears 265 times in the iPhone wordcloud, against 462 times in the Android wordcloud; the difference would even be bigger if the comparison were standardized for difference in sample sizes.
Among short forms — the first category of stopwords that has been examined —, there was an uneven occurrence breakdown by device for negative short forms, with predominance of the Android device.
Is there such an uneven occurrence breakdown by device in negative stopwords that are no short forms? Let’s have a look at the next table, which shows the number of occurrences for all stopwords other than short forms.
# There is a list, called list_other_stopwords, already available.
# There is also a data frame called output with the
# occurrence numbers of stopwords other than short forms.
# A column will be added for sample adjustment.
output <- output %>%
mutate(n_reg = round(i * comparability_factor[2], 0)) %>%
arrange(desc(a))
# Naming columns. Column names will be lodged in variables
# for coding convenience below.
nc1 <- "Other Stopword"
nc2 <- "Android Occurrences"
nc3 <- "iPhone Occurrences"
nc4 <- "iPhone Number Sample-adjusted"
output <- output %>%
`colnames<-`(c(nc1, nc2, nc3, nc4))
# Creating an interactive data table, using the DT package.
# This JavaScript extension Will color/background color the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# Prints the interactive datatable.
datatable(output, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(output) - 1))),
initComplete = JS(initComplete))) %>%
formatStyle(nc1, color = white, backgroundColor = deep_blue) %>%
formatStyle(nc2, color = white, backgroundColor = bluish_green) %>%
formatStyle(nc3, color = white, backgroundColor = gray) %>%
formatStyle(nc4, color = white, backgroundColor = gray_palette) Indeed, the stopword not appears 143 times among Android tweets and 61 times among iPhone tweets, which seems predictively interesting. For no, it is 72 against 40. Both statements go into the same direction as what has been observed about negative short forms, that is to say the predominance of the Android device. Nevertheless, there is a difference between short forms and other stopwords from a global point of view: among short forms, negative ones are the most frequent; among stopwords other than short forms, the negative ones are not the most frequent, by far.
At the end of this exploratory analysis of short forms and other stopwords, it seems appropriate to check up on the coding process. In order to identify and count stopword occurrences in a safe way, selection has been rather tough when using look-around coding: an occurrence of a stopword is counted if and only if it is
This process is intended to prevent false positives. But there were maybe false negatives? In other words, maybe some stopword occurrences have been missed and totals are underestimated. There could be, for instance, occurrences of stopwords followed by punctuation marks such as commas, periods — full stops —, colons, or semi-colons.
It could be argued that this does not matter. A function word — or stopword in Data Science — can be seen as a grammatical concept: see, for instance, this recap. But we are in stylometry, not in grammar. A subgroup can be formed using a combination of a grammatical notion and some slightly restrictive look-around coding, as long as the resulting subgroup makes sense from a predictivity point of view.
Instead of arguing, this opportunity will be seized to explore further: let’s quantify what some more flexible look-around coding would deliver in terms of occurrence breakdown by device. If breakdowns were more uneven, that would be a sufficient reason to move to alternative coding. By the way, this has not been done for short forms because short forms have seemed more securely identifiable with their apostrophe in the middle.
As a try, let’s move to less restrictive coding in identifying stopwords other than short forms and let’s draw a new interactive table.
# Tweets without URLs, mentions or hashtags and lowercased
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_to_lower(buffer, locale = "en")
# Receptacle data frame for numbers of occurrences of stopwords
# other than short forms with more flexible criteria
l <- length(list_other_stopwords)
output <-
data.frame(matrix(l * 4, nrow = l, ncol = 4) * 1)
# For loop to collect numbers of occurrences of other stopwords
for (i in 1:l) {
temp <- train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(blub = str_count(text,
paste("^", list_other_stopwords[i], "\\s|^",
list_other_stopwords[i], "[^\\.\\'][:punct:]|\\s",
list_other_stopwords[i], "\\s|\\s",
list_other_stopwords[i], "[^\\.\\'][:punct:]|[^\\.\\'][:punct:]",
list_other_stopwords[i], "\\s|[^\\.\\'][:punct:]",
list_other_stopwords[i], "[^\\.\\'][:punct:]|\\s",
list_other_stopwords[i], "\\$|[^\\.\\'][:punct:]",
list_other_stopwords[i], "\\$", sep = ""))) %>%
select(- text) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(blub)) %>%
mutate(device = as.character(device))
output[[i, 1]] <- list_other_stopwords[i]
output[[i, 2]] <- temp$n[1]
output[[i, 3]] <- temp$n[2]
output[[i, 4]] <- round(temp$n[2] * comparability_factor[2], 0)
}
rm(l, buffer, i, temp)
# RECALCULATING FOR I BECAUSE EVERYTHING HAD BEEN LOWERCASED
# Tweets without lowercasing
buffer <- train_utf8_no_urls_mentions_hashtags$text
# Index for I among other stopwords
index <- str_detect(list_other_stopwords, "^i$")
index <- which(index == TRUE)
# Data frame with occurrence number
temp <- train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(text = buffer) %>%
mutate(blub = str_count(text,
"^I\\s|^I[^\\.\\'][:punct:]|\\sI\\s|\\sI[^\\.\\'][:punct:]|[^\\.\\'][:punct:]I\\s|[^\\.\\'][:punct:]I[^\\.\\'][:punct:]|\\sI\\$|[^\\.\\'][:punct:]I\\$")) %>%
select(- text) %>%
mutate(device = as.factor(device)) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = sum(blub)) %>%
mutate(device = as.character(device))
output[[index, 1]] <- "I"
output[[index, 2]] <- temp$n[1]
output[[index, 3]] <- temp$n[2]
output[[index, 4]] <- round(temp$n[2] * comparability_factor[2], 0)
rm(index, temp)
# Naming columns. Column names will be lodged in variables
# for coding convenience below.
nc1 <- "Other Stopword - More flexible look-around"
nc2 <- "Android Occurrences"
nc3 <- "iPhone Occurrences"
nc4 <- "iPhone Number Sample-adjusted"
output <- output %>%
`colnames<-`(c(nc1, nc2, nc3, nc4))
# Sorting on column 2.
output <- output %>% arrange(desc(.[2]))
# Creating an interactive data table, using the DT package.
# This JavaScript extension will color/background color the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# Prints the interactive datatable.
datatable(output, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(output) - 1))),
initComplete = JS(initComplete))) %>%
formatStyle(nc1, color = white, backgroundColor = deep_blue) %>%
formatStyle(nc2, color = white, backgroundColor = bluish_green) %>%
formatStyle(nc3, color = white, backgroundColor = gray) %>%
formatStyle(nc4, color = white, backgroundColor = gray_palette) When comparing the two tables just above, several statements can be made.
For many stopwords, applying more flexible look-around code when identifying stopwords leads to a few additional occurrences, as readers can easily state by comparing both interactive tables.
In some cases, these additional occurrences make the occurrence breakdown by device a little bit more uneven. Just a few examples:
In other cases, which the readers can easily spot, the occurrence breakdown by device becomes somewhat less uneven. Just one example: no goes from 72/40 to 106/65. This is also an exception in terms of additional occurrences: there is indeed some solid increase between 72/40 and 106/65.
The stopword am has a peculiar evolution: it really jumps from 92/39 to much higher levels of 135/202, inverting the occurrence breakdown by device in favor of the iPhone! Among the new occurrences, there can be some occurrences of time format, etc. Anyway, this will be further investigated when building up prospective predictors.
Up to now, Exploratory Data Analysis has been about emojis, punctuation marks — including some symbols —, enclosed punctuation marks, special sequences, abbreviations, short forms and other stopwords. Let’s move towards entities — URLs, mentions, and hashtags.
The next table gives some global statistics about URLs.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(output, graph_title, x_title, y_title, angle, name1, name2)
# Table: statistics about URLs
tab <-
data.frame(a = 1, b = 2) %>%
mutate(a = sum(urls_count), b = length(urls)) %>%
mutate(a = format(a, big.mark = " "),
b = format(b, big.mark = " ")) %>%
`colnames<-`(c("Number of Occurrences of URLs",
"Number of Unique URLs"))
# Prints table with layout bg-primary.
knitr::kable(tab, align = "c",
table.attr = "class=\'bg-primary\'") %>%
kableExtra::kable_styling()| Number of Occurrences of URLs | Number of Unique URLs |
|---|---|
| 1 229 | 1 202 |
There are few repetitions of URLs. Let’s visualize which URLs are repeated and how often in the next table, which shows the number of occurrences of each URL by device.
# Data frame with URLs used by the Android device as a list of lists
v_a <-
train_utf8 %>%
select(device, text) %>%
filter(device == "Android") %>%
mutate(urls = str_extract_all(text, urls_as_pattern)) %>%
select(urls)
# Table with occurrence numbers in Android tweets
tab_a <-
data.frame(urls = unlist(v_a)) %>%
mutate(urls = as.factor(urls)) %>%
group_by(urls, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(urls = as.character(urls))
rm(v_a)
# Data frame with URLs used by the iPhone as a list of lists
v_i <- train_utf8 %>%
select(device, text) %>%
filter(device == "iPhone") %>%
mutate(urls = str_extract_all(text, urls_as_pattern)) %>%
select(urls)
# Table with occurrence numbers in iPhone tweets
tab_i <- data.frame(urls = unlist(v_i)) %>%
mutate(urls = as.factor(urls)) %>%
group_by(urls, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(urls = as.character(urls))
rm(v_i)
# Joins both tables.
j <- full_join(tab_a, tab_i, by = "urls")
j <- j %>%
filter(n.x >= 1 | n.y >= 1) %>%
arrange(desc(n.y))
rm(tab_a, tab_i)
# Naming columns. Column names will be lodged in
# variables for coding convenience in the datatable below.
nc1 <- "URL"
nc2 <- "Android Occurrences"
nc3 <- "iPhone Occurrences"
names(j) <- c(nc1, nc2, nc3)
# Creates an interactive data table, using the DT package.
# This JavaScript extension colors/background colors the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# Prints the interactive datatable.
datatable(j, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(j) - 1))),
initComplete = JS(initComplete))) %>%
formatStyle(nc1, color = white, backgroundColor = deep_blue) %>%
formatStyle(nc2, color = white, backgroundColor = bluish_green) %>%
formatStyle(nc3, color = white, backgroundColor = gray_palette) Repeating URLs has only been done by the iPhone. This can have some predictive value.
The next table shows the number of Android and iPhone tweets with no URL, with one URL, with two URLs, and with more than two URLs.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(tab, j, nc1, nc2, nc3, initComplete)
# Data frame with occurrence numbers of URLs
temp <- train_utf8_no_urls_mentions_hashtags %>%
select(device) %>%
mutate(device = as.factor(device)) %>%
mutate(urls_count = urls_count)
# Counts the number of tweets that contain no URL.
tab0 <- temp %>%
filter(urls_count == 0) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(device = as.character(device))
# Counts the number of tweets with 1 URL.
tab1 <- temp %>%
filter(urls_count == 1) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(device = as.character(device))
# Counts the number of tweets with 2 URLs.
tab2 <- temp %>%
filter(urls_count == 2) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(device = as.character(device))
# Counts the number of tweets with 3 or more URLs.
tab3 <- temp %>%
filter(urls_count >= 3) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(device = as.character(device))
rm(temp)
# Assembles the 4 sub-tables into a global one and
# converts tab from wide format to tidy format for ggplot2.
tab <- cbind(tab0, tab1$n, tab2$n, tab3$n) %>%
`colnames<-`(c("device", "0", "1", "2", ">= 3")) %>%
gather(key = punct, value = n, `0`:`>= 3`) %>%
select(punct, device, n)
rm(tab0, tab1, tab2, tab3)
graph_title <- "Dispersion of URLs"
x_title <- "Number of URL Occurrences in a Tweet"
y_title <- "Number of Tweets"
angle <- 0
name1 <- "Number of URL Occurrences in a Tweet"
name2 <- "Number of Tweets"
graphic_function(tab, graph_title, x_title, y_title,
angle, name1, name2)Among tweets without URL, there is predominance of the Android device with 74 % of tweets. On the contrary, among tweets with one or more URLs, there is strong predominance of the iPhone. This is valuable predictive information.
The next table shows some general statistics about mentions in training set tweets.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(tab, graph_title, x_title, y_title, angle, name1, name2)
# Table: statistics about mention occurrence
tab <- data.frame(a = 1, b = 2) %>%
mutate(a = sum(mentions_count),
b = length(mentions)) %>%
`colnames<-`(c("Number of Mention Occurrences",
"Number of Unique Mentions"))
# Prints table.
knitr::kable(tab, align = "c",
table.attr = "class=\'bg-primary\'") %>%
kableExtra::kable_styling()| Number of Mention Occurrences | Number of Unique Mentions |
|---|---|
| 926 | 305 |
The table above shows that there are many repetitions. This can be further investigated with the next table, which gives, for each mention, the number of occurrences in Android tweets and in iPhone tweets.
# Data frame with mentions used by the Android device as list of lists
v_a <- train_utf8_no_urls %>%
select(device, text) %>%
filter(device == "Android") %>%
mutate(v = str_extract_all(text, mentions_as_pattern)) %>%
.$v
# Data frame which gives, for each mention used by the
# Android device, the number of occurrences in Android tweets.
tab_a <- data.frame(mentions = unlist(v_a)) %>%
mutate(mentions = as.factor(mentions)) %>%
group_by(mentions, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(mentions = as.character(mentions))
rm(v_a)
# Data frame with mentions used by the iPhone as list of lists
v_i <- train_utf8_no_urls %>%
select(device, text) %>%
filter(device == "iPhone") %>%
mutate(v = str_extract_all(text, mentions_as_pattern)) %>%
.$v
# Data frame which gives, for each mention used by the iPhone,
# the number of occurrences in iPhone tweets.
tab_i <- data.frame(mentions = unlist(v_i)) %>%
mutate(mentions = as.factor(mentions)) %>%
group_by(mentions, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(mentions = as.character(mentions))
rm(buffer, v_i)
# Assembles both tables into a global one.
j <- full_join(tab_a, tab_i, by = "mentions") %>%
arrange(desc(n.x))
rm(tab_a, tab_i)
# Naming columns. Column names will be lodged in
# variables for coding convenience in the datatable below.
nc1 <- "Mentions"
nc2 <- "Android Occurrences"
nc3 <- "iPhone Occurrences"
names(j) <- c(nc1, nc2, nc3)
# Creates an interactive data table, using the DT package.
# This JavaScript extension colors/background colors the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# Prints the interactive datatable.
datatable(j, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(j) - 1))),
initComplete = JS(initComplete))) %>%
formatStyle(nc1, color = white, backgroundColor = deep_blue) %>%
formatStyle(nc2, color = white, backgroundColor = bluish_green) %>%
formatStyle(nc3, color = white, backgroundColor = gray_palette) For the first mentions in the table above, there are rather substantial numbers of occurrences and there is predominance of the Android device. This looks like valuable predictive information.
Capitalization can matter. Some mentions are written in different ways, once with all or some capitalization and once with less or no capitalization. This is the case for @Mike_Pence and @mike_pence. For @Mike_Pence, the iPhone has 8 occurrences and the Android device none; for @mike_pence, the numbers of occurrences are 4 for the iPhone and 2 for the Android device. In other words, the Android device never capitalizes Mike Pence’s name. This might have some additional predictive value.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(j, nc1, nc2, nc3, initComplete)
# Data frame with occurrence counts of mentions
temp <- train_utf8_no_urls %>%
select(device) %>%
mutate(device = as.factor(device)) %>%
mutate(mentions_count = mentions_count)
# Counts number of tweets that contain no mention.
tab0 <- temp %>%
filter(mentions_count == 0) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(device = as.character(device))
# Counts number of tweets with 1 mention.
tab1 <- temp %>%
filter(mentions_count == 1) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(device = as.character(device))
# Counts number of tweets with 2 mentions.
tab2 <- temp %>%
filter(mentions_count == 2) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(device = as.character(device))
# Counts number of tweets with 3 or more mentions.
tab3 <- temp %>%
filter(mentions_count >= 3) %>%
group_by(device, .drop = FALSE) %>%
summarise(n = n()) %>%
mutate(device = as.character(device))
rm(temp)
# Assembles the 4 sub-tables into a global one and
# converts tab from wide format to tidy format for ggplot2.
tab <- cbind(tab0, tab1$n, tab2$n, tab3$n) %>%
`colnames<-`(c("device", "0", "1", "2", ">= 3")) %>%
gather(key = punct, value = n, `0`:`>= 3`) %>%
select(punct, device, n)
rm(tab0, tab1, tab2, tab3)
# Parameter values for user-defined graphic_function()
graph_title <- "Dispersion of Mentions"
x_title <- "Mention Occurrences in a Tweet"
y_title <- "Number of Tweets"
angle <- 0
name1 <- "Mention Occurrences in a Tweet"
name2 <- "Number of Tweets"
# Running function to produce graph.
graphic_function(tab, graph_title, x_title, y_title,
angle, name1, name2)The picture is completely different from the URL picture. Here, the Android device predominates in tweets with one or more mention occurrences, with predominance being somewhat stronger — in proportion not in absolute numbers — in tweets with two mention occurrences. This might have some predictive value.
The Exploratory Data Analysis has pinpointed numerous tweet components that look like interesting predictive information when their occurrence is counted by tweet and by device. As examples with very uneven breakdown between device, let’s remember single quotation marks, the newline character — —, hyphens left enclosed — following a letter and followed by an empty space character —, and right parentheses nested between non-empty space characters.
Sometimes, occurrences were counted by tweet and by device for groups of text components. Let’s remember very uneven occurrence breakdowns by device for curly quotes/apostrophes, State abbreviations, URLs, and hashtags.
Most text components or groups of text components will be taken into consideration. Definitions can be changed with, for instance, alternative look-around coding. What will be looked for is uneven occurrence breakdown by device, preferably with numerous occurrences. Some less populated prospective predictors can nevertheless be kept if their usage is thought to be typical of one device. Overlapping of prospective predictors will be avoided.
Selection has often required visualization tools just as those used in Exploratory Data Analysis. For reasons of brevity, the visualization process is no longer shown. Anyway, absolutely all code is included in the Rmd file Code.Rmd and in the HTML file index.html, where it is accessible by pushing the tags on the right-hand side.
Selecting a prospective predictor on its own merits in relation with the target variable is only part of the process. Actually, its predictive efficacy depends also on its interaction with other prospective predictors. When the set of predictors is ready, its global predictive efficacy will be tested using a Machine Learning model on the training set. The metric chosen — accuracy — can help evaluate the predictive power of the set of prospective predictors. An analysis of false negatives and false positives can also indicate whether there is some leeway for making the set of prospective predictors more impactful.
The prospective predictors will be dealt in the same order as in the Exploratory Data Analysis and will be assembled by blocks: - emojis, - punctuation, - special sequences, - abbreviations, - short forms and other stopwords, - entities (URLs, mentions, hashtags).
This stepwise process will often require some retroaction to be applied to prospective predictors already dealt with.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(tab, graph_title, x_title, y_title, angle, name1, name2,
j, nc1, nc2, nc3, initComplete)
emoji_predictor <- ji_count(train_utf8$text)
# This predictor will join the punctuation block of predictors later on.First, single punctuation marks, whose list is in the code chunk just below.
single_punctuation_marks <-
c(".", "?", "!", ",", ":", "/", "+", "=", "$",
"_", "-", "–", "—", "(", ")", '"', "'")
column_names_single_punctuation <-
c("dot", "quest_mark", "ex_mark", "comma",
"colon", "slash", "plus", "equal", "dollar",
"underscore", "hyphen", "endash", "emdash",
"left_par", "right_par",
"doub_quot", "apostrophe_single_quote")
# Vector of multiple punctuation marks that will be first
# discarded before counting single punctuation marks.
to_be_discarded <-
c("\\.", "\\?", "!", ",", ":", "/", "\\+", "=", "\\$",
"_", "-", "–", "—", "\\(", "\\)", '\\"', "\\'")
to_be_discarded <-
paste(to_be_discarded, "{2,}", "|", sep = "", collapse = "")
# Are discarded as well two special sequences: numbers followed
# by a percent sign and & both will be treated
# with other special sequences below.
to_be_discarded <-
paste(to_be_discarded, "\\d+%|&", sep = "", collapse = "")
# Discards multiple punctuation marks from tweets
# just for counting single punctuation mark occurrences.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, to_be_discarded, "")
# Constructing a data frame as receptacle for single punctuation marks.
l <- length(single_punctuation_marks)
n <- nrow(train_utf8_no_urls_mentions_hashtags)
block_punctuation <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_single_punctuation)
# Counting occurrences of single punctuation marks.
for (i in 1:length(single_punctuation_marks)) {
block_punctuation[, i] <-
str_count(buffer, fixed(single_punctuation_marks[i]))
}
rm(single_punctuation_marks, column_names_single_punctuation,
to_be_discarded, buffer, l, n)
# Adding the emoji predictor.
block_punctuation$emojis_count <- emoji_predictor
rm(emoji_predictor)Then predictors from repetitive punctuation. This list is in the code chunk below.
column_names_repetitive_punctuation <-
c("doub_hyphen", "trip_hyphen", "ellipsis", "quad_dot")
# Constructing a data frame as receptacle for prospective predictors.
l <- length(column_names_repetitive_punctuation)
n <- nrow(train_utf8_no_urls_mentions_hashtags)
block_repetitive_punctuation <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_repetitive_punctuation)
# DOUBLE HYPHEN
# Discards triple (or more) hyphenS to count double hyphens.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, "-{3,}", "")
# Counting occurrences of double hyphen.
block_repetitive_punctuation[, 1] <- str_count(buffer, "--")
# TRIPLE HYPHEN AND ELLIPSIS
# Discards quadruple hyphens/dots to count triple hyphens and ellipses.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, "-{4,}|\\.{4,}", "")
# Counting occurrences of triple hyphen.
block_repetitive_punctuation[, 2] <- str_count(buffer, "---")
# Counting occurrences of ellipses.
block_repetitive_punctuation[, 3] <- str_count(buffer, "\\.{3}")
# QUADRUPLE DOT
# Discards quintuple dot for counting quadruple dot.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, "\\.{5,}", "")
# Counting occurrences of quadruple dot.
block_repetitive_punctuation[, 4] <- str_count(buffer, "----")
# Joining with existing block_punctuation.
block_punctuation <-
cbind(block_punctuation, block_repetitive_punctuation)
rm(column_names_repetitive_punctuation,
block_repetitive_punctuation, buffer, l, n)Then single quotes and apostrophes.
As already explained at length in the Exploratory Data Analysis, the mark ’ can be a single quotation mark — as often seen in iPhone tweets and never in Android tweets — or an apostrophe. The occurrences of single quotation marks will be counted for each tweet, kept in a separate prospective predictor, and subtracted from the count of all occurrences of the mark ’, delivering at the same time a prospective predictor with the occurrence count of apostrophes.
This is done after tweet conversion to UTF-8, which replaces all curly apostrophe/single quote with straight ones. Curly apostrophes and quotes will be treated separately later one.
# Single quotes will be counted first.
# Tweets will be lowercased since short forms need identifying
# and short forms are already lowercased in the package stopwords.
text <- train_utf8_no_urls_mentions_hashtags$text
text <- str_to_lower(text, locale = "en")
# Let's remove apostrophes from
# - the possessive forms 's,
# - plural forms such as $'s,
# - enclosed apostrophes in short forms or
# - in e.g. some family names like O'Reilly,
# - apostrophes in abbreviated two-digit format for years,
# - trailing apostrophes at the end of two colloquialisms,
# i.e. "ya'" and "lyin'", used by the Android device,
# - apostrophes in #'s and $'s .
# By the way, we do not eliminate apostrophes from the possessive
# forms s' because at the same time we would eliminate quotes.
# We'll tackle that later on, in a completely different way.
text <- str_replace_all(text, "([^\\s])(\\')(s)", "\\1\\3")
text <- str_replace_all(text,
"([A-Za-z])(\\')([A-Za-z])", "\\1\\3")
text <- str_replace_all(text, "(\\')(\\d{2})", "\\2")
text <- str_replace_all(text, "(ya)(\\')", "\\1")
text <- str_replace_all(text, "(lyin)(\\')", "\\1")
text <- str_replace_all(text, "(#)(\\')(s)", "\\1\\3")
text <- str_replace_all(text, "(\\$)(\\')(s)", "\\1\\3")
# There can still remain apostrophes, probably in limited number.
# The number of single quotes should be an even number. If there
# remains 1 apostrophe among them in a tweet, then the number
# of presumed single quotes is an odd number: must be decreased by 1.
# Of course, this wouldn't solve the problem if there were 2 or 3
# apostrophes left in one tweet; this is not highly probable and
# has already been checked as not happening in the training set.
single_quote <- str_count(text, "\\'")
for (i in 1:nrow(train_utf8_no_urls_mentions_hashtags)) {
# If the number of ' in a tweet is an odd number ...
if ((single_quote[i] %% 2) > 0) {
# ... then the number of presumed single quotes is reduced by 1.
single_quote[[i]] <- single_quote[i] - 1
}
}
# single_quot will join the prospective predictors from
# block_punctuation and will be deducted from the prospective
# predictor "apostrophe_single_quote" to give "apostrophe".
block_punctuation$single_quote <- single_quote
block_punctuation$apostrophe <-
block_punctuation$apostrophe_single_quote - single_quote
block_punctuation$apostrophe_single_quote <- NULL
rm(text, i, single_quote)Last, among single punctuation marks, curly ones: curly single quotes, curly double quotes, curly apostrophes. The total of curly quotes and apostrophes in each tweet will be a potential predictor.
Of course, counting will be done on a version of tweets without standardization — no UTF-8 conversion which replaces curly apostrophes/singles quotes with straight ones and no replacement of curly double quotes with straight ones.
Five frequently enclosed punctuation marks have been selected as prospective predictors: dots, hyphens, colons, commas, and slashes.
For each of these five punctuation marks, a prospective predictor will be built up with the occurrence count of enclosed single punctuation in each tweet.
For hyphen, colon, comma, and slash, the prospective predictor based on enclosed punctuation will also be deducted from the general prospective predictor for the same punctuation mark. For dots, reshuffling will wait until additional information is gained — such as special sequences, abbreviations, etc.
single_enclosed_punctuation <-
c("\\.", "-", ":", ",", "/")
column_names_single_enclosed_punctuation <-
c("dot_enc", "enc_hyphen", "enc_colon", "enc_comma",
"enc_slash")
# Constructing a receptacle data frame for occurrence counts.
l <- length(single_enclosed_punctuation)
n <- nrow(train_utf8_no_urls_mentions_hashtags)
block_single_enclosed_punctuation <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_single_enclosed_punctuation)
# Counting occurrences of top enclosed punctuation marks.
for (i in 1:length(single_enclosed_punctuation)) {
# The following pattern counts enclosed single punctuation
# marks but no enclosed repetitions of punctuation marks.
pattern <-
paste("(?![:punct:])\\S", single_enclosed_punctuation[i],
"(?![:punct:])\\S", sep = "")
block_single_enclosed_punctuation[, i] <-
str_count(train_utf8_no_urls_mentions_hashtags$text,
pattern)
}
# Adding prospective predictors from enclosed punctuation
# to existing block of punctuation.
block_punctuation <-
cbind(block_punctuation, block_single_enclosed_punctuation)
# Deducting occurrence numbers of enclosed single punctuation
# from the total occurrence numbers of the same punctuation mark.
block_punctuation$comma <-
block_punctuation$comma - block_punctuation$enc_comma
block_punctuation$hyphen <-
block_punctuation$hyphen - block_punctuation$enc_hyphen
block_punctuation$colon <-
block_punctuation$colon - block_punctuation$enc_colon
block_punctuation$slash <-
block_punctuation$slash - block_punctuation$enc_slash
# Prospective predictors related to dots will be adjusted later on
# with additional information.
rm(single_enclosed_punctuation,
column_names_single_enclosed_punctuation,
block_single_enclosed_punctuation, l, n, i, pattern)Most special sequences from EDA are taken over. The list is included in the chunk below.
Some reshuffling is done: some specialized prospective predictors are subtracted from more general ones, producing more uneven occurrence breakdowns by device — and in that sense more precise information — and avoiding overlapping.
Some previous prospective predictors are replaced by new ones, always with the idea of producing more uneven occurrence breakdowns by device and avoiding overlapping.
All code is included in the chunk below, fully commented upon.
special_sequences <-
c("\\\n",
"\\&",
"\\S\\)\\S",
"[^[:punct:]]![^[:punct:]]",
"[a-zA-Z]-\\s",
"\\s-\\s",
"[a-zA-Z]-[a-zA-Z]",
"[a-z]\\.\\s",
"^\\.",
"!$",
"\\?$",
"\\.$",
"[:alnum:]$",
"[\\s\\$]\\d{1,3},\\d{3},\\d{3}[\\s[:punct:][a-zA-Z]][^\\d]",
"\\d+\\.\\d+",
"\\d+[^\\.%]",
"\\sA.M.|\\sP.M.",
"\\d{1,2}\\:\\d{2}(am|pm)",
"[^\\d]\\d{1,2}/\\d{1,2}/(\\d{2}){1,2}[\\s[a-zA-Z][:punct:]]",
"\\'\\d{2}[\\s[:punct:]]",
"[\\s[:punct:]]U.S.[\\s[:punct:]]",
"U.S.A.",
"Lyin\\'|lyin\\'|LYIN\\'",
"Havn\\'t|havn\\'t|HAVN\\'T",
"^#1[^\\w]|#1[^\\w]|#1$")
column_names_special_sequences <-
c("newline",
"ampersand",
"spec_enc_right_par",
"ex_mar_wo_punct",
"semi_enc_hyphen",
"free_hyphen",
"compounded_hyphen",
"period",
"dot_upfront",
"tweet_end_ex_mark",
"tweet_end_quest_mark",
"dot_tweet_end",
"tweet_end_alphanum",
"thousand_sep_2",
"dot_decimal_part",
"percentual_number",
"A.M._or_P.M.",
"time_num_colon_ampm",
"date_num_slashes",
"abb_millenial",
"U.S.",
"U.S.A.",
"lyin_apostrophe",
"havnt_apostrophe",
"number_one")
# Constructing a receptacle data frame for prospective predictors.
l <- length(special_sequences)
n <- nrow(train_utf8_no_urls_mentions_hashtags)
block_special_sequences <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_special_sequences)
# Counting occurrences of each special sequence in each tweet.
for (i in 1:length(special_sequences)) {
block_special_sequences[, i] <-
str_count(train_utf8_no_urls_mentions_hashtags$text,
special_sequences[i])
}
# Adding prospective predictors based on special sequences
# to existing block_punctuation.
block_punctuation <-
cbind(block_punctuation, block_special_sequences)
# Special sequence prospective predictors will be subtracted
# from more general ones.
block_punctuation$quest_mark <-
block_punctuation$quest_mark -
block_punctuation$tweet_end_quest_mark
block_punctuation$apostrophe <-
block_punctuation$apostrophe -
block_punctuation$abb_millenial
# From the prospective predictor right_par, we will subtract the
# prospective predictor spec_enc_right_par, which is specially enclosed
# on both sides in the sense that the surrounding characters can be
# any non-empty character including other punctuation marks, which
# has not been done for other enclosed single punctuation marks.
block_punctuation$right_par <-
block_punctuation$right_par -
block_punctuation$spec_enc_right_par
# The prospective predictor left_par, which contains the same
# information as right_par, will be discarded.
block_punctuation$left_par <- NULL
# Special sequences will be subtracted from enclosed ones.
block_punctuation$enc_comma <-
block_punctuation$enc_comma -
block_punctuation$thousand_sep_2
block_punctuation$enc_colon <-
block_punctuation$enc_colon -
block_punctuation$time_num_colon_ampm
block_punctuation$enc_slash <-
block_punctuation$enc_slash -
(2 * block_punctuation$date_num_slashes)
# Some prospective predictors will be deleted because some special
# sequences provide more uneven occurrence breakdowns by device.
# First, the prospective predictors dot and dot_enc
# are replaced by the prospective predictors period, dot_upfront,
# dot_tweet_end, A.M._or_P.M., U.S., U.S.A., and dot_decimal_part.
block_punctuation$dot <- NULL
block_punctuation$dot_enc <- NULL
# Second, the prospective predictors hyphen and enc_hyphen are
# replaced by the predictors semi_enc_hyphen, free_hyphen,
# and compounded_hyphen.
block_punctuation$hyphen <- NULL
block_punctuation$enc_hyphen <- NULL
# Third, the prospective predictors ex_mark and enc_ex_mark are
# replaced by the prospective predictors ex_mar_wo_punct and
# tweet_end_ex_mark.
block_punctuation$ex_mark <- NULL
block_punctuation$enc_ex_mark <- NULL
rm(special_sequences, column_names_special_sequences,
block_special_sequences, l, n, i)First, there will a prospective predictor based on occurrences of State abbreviations without dots.
# List of abbreviations of American States without dots
state_abb_without_dots <- state.abb
# Replacing State abbreviations with placeholder.
buffer <- train_utf8_no_urls_mentions_hashtags$text
for (i in 1:length(state_abb_without_dots)) {
# If State abbreviation is in front of tweet.
buffer <-
str_replace_all(buffer,
paste("(^", state_abb_without_dots[i],
")([\\s[:punct:]])", sep = ""),
"ABBPLACEHOLDER\\2")
# If State abbreviation is in the "middle" of tweet.
buffer <-
str_replace_all(buffer,
paste("([\\s[:punct:]])(", state_abb_without_dots[i],
")([\\s[:punct:]])", sep = ""),
"\\1ABBPLACEHOLDER\\3")
# If State abbreviation is at the end of tweet.
buffer <-
str_replace_all(buffer,
paste("([\\s[:punct:]])(", state_abb_without_dots[i],
"$)", sep = ""),
"\\1ABBPLACEHOLDER")
# 2nd iteration for a State abbreviation in the "middle"
buffer <-
str_replace_all(buffer,
paste("([\\s[:punct:]])(", state_abb_without_dots[i],
")([\\s[:punct:]])", sep = ""),
"\\1ABBPLACEHOLDER\\3")
}
# Counting occurrences of State abbreviations without dots.
State_abb <- str_count(buffer, "ABBPLACEHOLDER")
# Adding count to block_punctuation.
block_punctuation$State_abb <- State_abb
rm(state_abb_without_dots, buffer, i, State_abb)Second, there will a prospective predictor built up on the basis of abbreviations with dots. Occurrences of dots will be counted instead of occurrences of abbreviations.
Another prospective predictor — time format abbreviations with dots — will be transformed in the same way to allow reconciliation with other statistics about dots.
There will be a full reshuffle of all prospective predictors based on dots:
The whole code, amply commented upon, is incorporated in the code chunk below.
# Extracting all abbreviations uppercased and with dots.
# The Regex pattern looks for abbreviations at the beginning
# of tweets or elsewhere in tweets.
blub <-
str_extract_all(train_utf8_no_urls_mentions_hashtags$text,
"^([:upper:]\\.){2,}|[^[:alpha:]]([:upper:]\\.){2,}")
# Unlisting and collapsing each row in order to get a vector
# of abbreviations for each row.
for (i in 1:length(blub)) {
blub[[i]] <- paste(unlist(blub[i]), sep = "", collapse = " ")
}
# Eliminating a possible dot in front of an abbreviation.
blub <- str_replace_all(blub, "(\\s)(\\.)([:upper:])", "\\1,\\3")
# Counts the number of dots in abbreviations in each row.
block_punctuation$dot_abb <- str_count(blub, "\\.")
# Expressing the prospective predictor block_punctuation$A.M._or_P.M.
# in occurrences of dots instead of occurrences of sequences
# in order to subtract it from block_punctuation$dot_abb.
# block_punctuation$A.M._or_P.M. is kept because of fully uneven
# occurrence breakdown by device with zero cases for the iPhone.
block_punctuation$A.M._or_P.M. <- 2 * block_punctuation$A.M._or_P.M.
block_punctuation$dot_abb <-
block_punctuation$dot_abb - block_punctuation$A.M._or_P.M.
rm(blub, i)
# In order to avoid double counting with other dot prospective predictors,
# abbreviations will be neutralized by being replaced by placeholders.
# First, neutralizing abbreviations with dots in the "middle" or
# at the end of tweets. Abbreviations are replaced by a placeholder
# followed by a comma in order to keep a punctuation mark and preserve
# preexisting relationships with surrounding characters.
buffer <-
str_replace_all(train_utf8_no_urls_mentions_hashtags$text,
"([^[:alpha:]])(([:upper:]\\.){2,})",
"\\1ABBPLACEHOLDER,")
# Second, neutralizing abbreviations with dots in front of tweets.
buffer <- str_replace_all(buffer,
"^([:upper:]\\.){2,}",
"ABBPLACEHOLDER,")
# This buffer is a starting point to rationalize the whole set of
# prospective predictors related to dots.
# Some previous prospective predictors will be dropped because they
# have been implicitly regrouped in dot_abb .
block_punctuation$U.S. <- NULL
block_punctuation$U.S.A. <- NULL
# Registers repetitions of dots and discards them from buffer.
block_punctuation$dot_three_plus <- str_count(buffer, "(\\.){3,}")
buffer <- str_replace_all(buffer, "(\\.){3,}", ",")
buffer <- str_replace_all(buffer, "(\\.){2}", ",")
# Removes block_punctuation$quad_dot which has been implicitly
# incorporated into block_punctuation$dot_three_plus.
block_punctuation$quad_dot <- NULL
# The rest from this code chunk mainly spots specific subgroups of
# enclosed dots with uneven occurrence breakdown by device.
# Dots enclosed between alphabetic characters
block_punctuation$dot_enc_alpha <-
str_count(buffer, "[:alpha:]\\.[:alpha:]")
buffer <- str_replace_all(buffer,
"([:alpha:])(\\.)([:alpha:])",
"\\1,\\3")
block_punctuation$dot_enc <- NULL
# Dots enclosed between digits.
block_punctuation$dot_decimal_part <- str_count(buffer, "\\d\\.\\d")
buffer <- str_replace_all(buffer, "(\\d)(\\.)(\\d)", "\\1,\\3")
# Dots enclosed between letter and double quotation marks.
block_punctuation$dot_enc_alpha_doub_quot <-
str_count(buffer, "[:alpha:]\\.\"")
buffer <- str_replace_all(buffer, "([:alpha:])(\\.)(\")", "\\1,\\3")
# Dots enclosed in other combinations are just removed.
buffer <- str_replace_all(buffer, "(\\S)(\\.)(\\S)", "\\1,\\3")
# Dots left enclosed between hashtags and empty space.
block_punctuation$dot_left_enc_hashtag <-
str_count(buffer, "HASHTAGPLACEHOLDER\\.")
buffer <- str_replace_all(buffer,
"HASHTAGPLACEHOLDER\\.",
"HASHTAGPLACEHOLDER,")
# Dots left enclosed between mentions and empty space.
block_punctuation$dot_left_enc_mention <-
str_count(buffer, "MENTIONPLACEHOLDER\\.")
buffer <- str_replace_all(buffer,
"MENTIONPLACEHOLDER\\.",
"MENTIONPLACEHOLDER,")
# Dots left enclosed between URLs and empty space are just removed.
buffer <- str_replace_all(buffer,
"URLPLACEHOLDER\\.",
"URLPLACEHOLDER,")
# Recalculates number of dots at the end of tweets.
block_punctuation$dot_tweet_end <- str_count(buffer, "\\.$")
buffer <- str_replace_all(buffer, "\\.$", ",")
# Recalculates number of dots at the beginning of tweets.
block_punctuation$dot_upfront <- str_count(buffer, "^\\.")
buffer <- str_replace_all(buffer, "^\\.", ",")
# Dots left enclosed between upper letter and empty space.
block_punctuation$dot_left_enc_upper <-
str_count(buffer, "[:upper:]\\.\\s")
# Dots left enclosed between lower letter or digit and empty space.
block_punctuation$dot_left_enc_lower_digit <-
str_count(buffer, "[[:lower:]\\d]\\.\\s")
# Dots left enclosed between punctuation and empty space.
block_punctuation$dot_left_enc_punct <-
str_count(buffer, "[:punct:]\\.\\s")
# The remaining dots are not incorporated in any prospective predictor.
# Previous prospective predictor is out because of redundancy.
block_punctuation$dot <- NULLThere will be prospective predictors for all short forms with at least 10 occurrences in training set tweets.
# Short forms will be added as prospective predictors if and only if
# the number of occurrences in training set tweets is at least 10.
# In order to facilitate text mining, let's lowercase
# the tweets since short forms are already lowercased.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_to_lower(buffer, locale = "en")
# Receptacle data frame for occurrence totals for each short form
l <- length(list_short_forms)
output <-
data.frame(matrix(l * 2, nrow = l, ncol = 2) *1) %>%
`colnames<-`(c("short_form", "n"))
# For loop for counting totals.
for (i in 1:l) {
df <-
data.frame(buffer) %>%
mutate(n = str_count(buffer, list_short_forms[i])) %>%
summarise(n = sum(n))
output$short_form[[i]] <- list_short_forms[i]
output$n[[i]] <- df$n
}
# Identifying short forms with at least 10 occurrences.
output <- output %>% filter(n >= 10)
top_short_forms <- output$short_form
column_names_top_short_forms <-
str_replace_all(top_short_forms, "\\'", "_")
# Receptacle data frame for top short form prospective predictors
l <- length(top_short_forms)
n <- nrow(train_utf8_no_urls_mentions_hashtags)
block_top_short_forms <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_top_short_forms)
# Counting occurrences of top short forms for each tweet.
for (i in 1:length(top_short_forms)) {
block_top_short_forms[, i] <-
str_count(train_utf8_no_urls_mentions_hashtags$text,
top_short_forms[i])
}
rm(buffer, output, column_names_top_short_forms, df, l, n, i)There will be prospective predictors for all stopwords other than short forms with at least 10 occurrences in training set tweets.
# Other stopwords will be added as prospective predictors if and
# only if the number of occurrences in all tweets is at least 10.
# In order to facilitate text mining, let's lowercase
# the tweets since short forms are already lowercased.
buffer <- train_utf8_no_urls_mentions_hashtags$text
buffer <- str_to_lower(buffer, locale = "en")
# Receptacle data frame for occurrence totals for each other stopword
l <- length(list_other_stopwords)
output <-
data.frame(matrix(l * 2, nrow = l, ncol = 2) *1) %>%
`colnames<-`(c("other_stopword", "n"))
# Counting totals.
for (i in 1:l) {
df <-
data.frame(buffer) %>%
mutate(n =
str_count(buffer,
paste("^", list_other_stopwords[i], "[^[:alnum:]-]",
"|[^[:alnum:]-]", list_other_stopwords[i], "[^[:alnum:]-]",
"|[^[:alnum:]-]", list_other_stopwords[i], "\\$",
sep = ""))) %>%
summarise(n = sum(n))
output$other_stopword[[i]] <- list_other_stopwords[i]
output$n[[i]] <- df$n
}
# Identifying stopwords other than short forms
# with at least 10 occurrences in all tweets taken together.
output <- output %>% filter(n >= 10)
top_other_stopwords <- output$other_stopword
column_names_top_other_stopwords <-
str_replace_all(top_other_stopwords, "\\'", "_")
# Receptacle data frame for top other stopword prospective predictors
l <- length(top_other_stopwords)
n <- nrow(train_utf8_no_urls_mentions_hashtags)
block_top_other_stopwords <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_top_other_stopwords)
# Counting occurrences of each top other stopword in each tweet.
for (i in 1:length(top_other_stopwords)) {
block_top_other_stopwords[, i] <-
str_count(buffer,
paste("^", top_other_stopwords[i], "[^[:alnum:]-]",
"|[^[:alnum:]]", top_other_stopwords[i], "[^[:alnum:]-]",
"|[^[:alnum:]]", top_other_stopwords[i], "\\$",
sep = ""))
}
rm(column_names_top_other_stopwords, output, l, n, i, df)
# Building up one single block for all stopwords.
block_stopwords <-
cbind(block_top_short_forms, block_top_other_stopwords)
rm(block_top_short_forms, block_top_other_stopwords)2 paths will be followed:
Why 3 occurrences? Because multi-URL tweets seems to be a characteristic of iPhone tweets, no Android tweet containing more than one URL.
# Keeping just tweets.
buffer <- train_utf8$text
# First prospective predictor: total number of URLs in each tweet
# already exists in urls_count.
# Counting the occurrences in all tweets together for each URL.
output <- data.frame(url = urls, n = 1:length(urls))
for (i in 1:length(urls)) {
output$n[[i]] <- sum(str_count(buffer, urls[i]))
}
# Keeping URLs with at least 3 occurrences.
output <- output %>% filter(n >=3)
# Making prospective predictors out of them, with, for each,
# the number of occurrences in each tweet.
l <- length(buffer)
n <- length(output$n)
block_urls <-
data.frame(matrix(l * n, nrow = l, ncol = n) * 1) %>%
`colnames<-`(output$url)
for (i in 1:length(output$n)) {
block_urls[, i] <- str_count(buffer, urls[i])
}
# Adding the general total urls_count to the block.
block_urls <- cbind(block_urls, urls_count)
# Keeping for later use.
top_urls <- output$url
rm(buffer, output, i, l, n)2 paths will be followed:
# Keeping just tweets.
buffer <- train_utf8_no_urls$text
# The first prospective predictor already exists: it is the number of
# mention occurrences in each tweet, which is recorded in mentions_count.
# Other prospective predictors from mentions having at least
# 10 occurrences will be added.
# Length of existing mention vector.
l <- length(mentions)
# Receptacle data frame with number of occurrences for each mention
# in all tweets together, to pick up the top mentions.
output <- data.frame(mention = mentions, n = 1:l)
for (i in 1:l) {
output$n[[i]] <- sum(str_count(buffer, mentions[i]))
}
# Keeping mentions with at least 10 occurrences.
output <- output %>% filter(n >= 10)
# Making prospective predictors out of them, with, for each,
# the number of occurrences in each tweet.
l <- length(buffer)
n <- length(output$n)
block_mentions <-
data.frame(matrix(l * n, nrow = l, ncol = n) * 1)
for (i in 1:n) {
block_mentions[, i] <- str_count(buffer, output$mention[i])
}
# Making column names R friendly.
names(block_mentions) <- make.names(output$mention, unique = TRUE)
# Adding the general total mentions_count to the block.
block_mentions <- cbind(block_mentions, mentions_count)
# Keeping for later use.
top_mentions <- output$mention
rm(buffer, l, n, output, i)2 paths will be followed:
# Keeping just tweets.
buffer <- train_utf8_no_urls$text
# The first prospective predictor already exists: it is the number of
# hashtag occurrences in each tweet, which is recorded in hashtags_count.
# Other prospective predictors from hashtags having at least
# 10 occurrences will be added.
# Length of existing hashtag vector.
l <- length(hashtags)
# Receptacle data frame with number of occurrences for each hashtag
# in all tweets together, to pick up the top hashtags.
output <- data.frame(hashtag = hashtags, n = 1:l)
for (i in 1:l) {
output$n[[i]] <- sum(str_count(buffer, hashtags[i]))
}
# Keeping hashtags with at least 10 occurrences.
output <- output %>% filter(n >= 10)
# Making prospective predictors out of them, with, for each,
# the number of occurrences in each tweet.
l <- length(buffer)
n <- length(output$n)
block_hashtags <-
data.frame(matrix(l * n, nrow = l, ncol = n) * 1)
for (i in 1:n) {
block_hashtags[, i] <- str_count(buffer, output$hashtag[i])
}
# Making column names R friendly.
names(block_hashtags) <- make.names(output$hashtag, unique = TRUE)
# Adding the general total hashtags_count to the block.
block_hashtags <- cbind(block_hashtags, hashtags_count)
# Assembling prospective predictors from URLs, mentions, and hashtags.
block_entities <-
cbind(block_urls, block_mentions, block_hashtags)
# Keeping for later use.
top_hashtags <- output$hashtag
rm(buffer, l, n, output, i)
rm(block_urls, block_mentions, block_hashtags)After Data Profiling, Data Wrangling, Exploratory Data Analysis, and Prospective Predictor Building, we are ready to run a Machine Learning model on the training set.
The ML algorithm chosen is eXtreme Gradient Boosting Tree, trained by the train() function from the caret metapackage. XGB Tree will be run on five sets of values for the tuning parameters and with the accuracy metric.
# In the function train() from the package caret, the target
# variable needs to be a numerical or factor vector. It will
# be made numerical and then factor with 1 representing
# the iPhone (the main class) and 2 representing the Android device.
target <- train_no_utf8$device
target <- str_replace_all(target, "Android", "2")
target <- str_replace_all(target, "iPhone", "1")
target <- as.factor(target)
# Keeping 3 versions of tweets for later results decoding
# and removing data frames for reasons of RAM management.
memo <-
data.frame(tweets_no_utf8 = train_no_utf8$text,
tweets_utf8 = train_utf8$text,
tweets_utf8_no_urls =
train_utf8_no_urls$text,
tweets_utf8_no_urls_mentions_hashtags = train_utf8_no_urls_mentions_hashtags$text)
rm(train_no_utf8,
train_utf8,
train_utf8_no_urls,
train_utf8_no_urls_mentions_hashtags)
# Running algorithm xgbTree.
set.seed(1)
attribution_model <- train(block_stylo, target,
method = "xgbTree",
tuneLength = 5,
metric = "Accuracy")
# Predicting on training set.
pred <- predict(attribution_model)
# Accuracy
acc <- mean(pred == target)
# Table with accuracy
tab <- data.frame(acc) %>%
`colnames<-`("Tweet Attribution Model on Training Set - 1st Run") %>%
`rownames<-`("Accuracy")
# Prints table with bg-primary layout.
knitr::kable(tab, align = "c",
table.attr = "class=\'bg-primary\'") %>%
kableExtra::kable_styling()| Tweet Attribution Model on Training Set - 1st Run | |
|---|---|
| Accuracy | 0.9160653 |
Running the algorithm eXtreme Gradient Boosting Tree on the training set delivers an accuracy level of 92 %.
This accuracy level has been reached only through stylometry, on very short texts. This makes the result even more noticeable.
Nevertheless, attempts will be done to better the result towards a higher accuracy level. Two research avenues will be followed:
This analysis will hopefully deliver insights about prediction errors and about additional prospective predictors or alternative prospective predictors.
False negatives will be shown in the table below.
# Detecting false negatives and printing corresponding tweets.
# Index of all wrong predictions
index <- which(! pred == target)
# Table of all wrong predictions
tab <- data.frame(text = memo$tweets_no_utf8, device = target)
tab <- tab[index, ]
# Table of false negatives
tab_fn <- tab %>%
filter(device == 1) %>%
select(text) %>%
`colnames<-`("False Negative: iPhone Tweet Wrongly Attributed")
# Creating an interactive data table, using the DT package.
# This JavaScript extension will color/background color the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# This JavaScript extension will color/background color the body text.
rowCallback <-
c("function(row, data, index, rowId) {
",
"console.log(rowId)",
"if(rowId >= 0) {
",
"row.style.backgroundColor = '#999999',
row.style.color = 'white';",
"}",
"}
")
# Prints the interactive datatable with the JavaScript extensions.
datatable(tab_fn, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(tab_fn) - 1))),
initComplete = JS(initComplete),
rowCallback = JS(rowCallback)))Now the false positives: the Android tweets wrongly attributed to the iPhone.
# Table of false positives
tab_fp <- tab %>%
filter(device == 2) %>%
select(text) %>%
`colnames<-`("False Positive: Android Tweet Wrongly Attributed")
# Creating an interactive data table, using the DT package.
# This JavaScript extension will color/background color the header.
initComplete <-
c("function(settings, json) {
",
"$(this.api().table().header()).css({
'background-color': '#0072B2', 'color': 'White'
});",
"}
")
# This JavaScript extension will color/background color the body text.
rowCallback <-
c("function(row, data, index, rowId) {
",
"console.log(rowId)",
"if(rowId >= 0) {
",
"row.style.backgroundColor = '#0072B2',
row.style.color = 'white';",
"}",
"}
")
# Prints the interactive datatable with the JavaScript extensions.
datatable(tab_fp, rownames = FALSE, filter = "top",
options =
list(pageLength = 5, scrollX = T,
columnDefs = list(list(className = 'dt-center',
targets = 0:(ncol(tab_fp) - 1))),
initComplete = JS(initComplete),
rowCallback = JS(rowCallback)))As shown in the two tables above, there are much more false negatives than false positives. In other words, wrong attribution has mainly struck the tweets that have actually been sent by the iPhone.
Attention has been paid to both false negatives and false positives, but especially to false negatives since they are much more numerous. Insights have been looked for towards additional or alternative patterns to better detect the sending device.
While searching additional or alternative patterns, for each prospective pattern, four questions have been asked and answered:
Moreover, knowing that the predictive power of a prospective predictor depends also on interactivity with the other prospective predictors and not only on relationships between the prospective predictor and the target variable, the attributive model has been rerun several times with different versions of the additional or alternative prospective predictors. For reasons of brevity, this is not reported here. In this context, let’s mention a serendipitous improvement: at this stage of the project, it has been noticed that ellipses represented by a single Unicode character U+2026 had been forgotten, probably due to visual similarity with sequences of three separate dots in a row; introducing them when running the attributive model has led to an accuracy level somewhat lower. Consequently, they have been definitively dropped, leading to a small improvement thanks to serendipity … and some work as well.
The new patterns that have been chosen at the end of the analysis of false negatives and false positives are the following ones:
Moreover, three regroupings have been processed because occurrence numbers were very limited:
All coding details are available, as always, in the next code chunk.
# Objects from previous code chunk are removed here and not
# at the end of previous code chunk after printing in order to
# avoid an additional tag in the HTML document.
rm(index, tab, tab_fn, tab_fp)
# Preparing a version of tweets not only without URLs, mentions,
# and hashtags, but also without their placeholders.
text <-
str_replace_all(memo$tweets_utf8_no_urls_mentions_hashtags,
"URLPLACEHOLDER|MENTIONPLACEHOLDER|HASHTAGPLACEHOLDER", "")
# Here are the additional prospective predictors.
w_slash <- str_count(text, "w/|\\sw\\s")
r_apostrophe_s <- str_count(text, "R's")
re_colon <- str_count(text, "re:")
digit_slash_digit_within_par <- str_count(text, "\\(\\d/\\d\\)")
digits_ampm <- str_count(text, "[^\\d:]\\dam|[^\\d:]\\dpm")
words_lowercased <- str_count(text, "[:lower:]{2,}")
MAGA <- str_count(text, "MAKE AMERICA GREAT AGAIN")
GOP <- str_count(text, "GOP")
CNN <- str_count(text, "CNN")
yrs <- str_count(text, "yrs")
# Assembling them into a new block.
block_addendum <-
cbind(words_lowercased, MAGA, GOP, CNN, w_slash,
r_apostrophe_s, yrs, digit_slash_digit_within_par,
re_colon, digits_ampm)
rm(words_lowercased, MAGA, GOP, CNN, w_slash,
r_apostrophe_s, yrs, digit_slash_digit_within_par,
re_colon, digits_ampm)
# Adding the new block to the existing one.
block_stylo <- cbind(block_stylo, block_addendum)
rm(block_addendum)
# First regrouping
block_stylo$dash <- block_stylo$endash + block_stylo$emdash
block_stylo$endash <- NULL
block_stylo$emdash <- NULL
# Second regrouping
block_stylo$ampm <- block_stylo$time_num_colon_ampm +
block_stylo$digits_ampm
block_stylo$time_num_colon_ampm <- NULL
block_stylo$digits_ampm <- NULL
# Third regrouping
block_stylo$top_urls <- block_stylo$`https://t.co/3KWOl20zMm` +
block_stylo$`https://t.co/3KWOl2ibaW` +
block_stylo$`https://t.co/ANvTcZqfOq` +
block_stylo$`https://t.co/PVB6QX7VpK`
block_stylo$`https://t.co/3KWOl20zMm` <- NULL
block_stylo$`https://t.co/3KWOl2ibaW` <- NULL
block_stylo$`https://t.co/ANvTcZqfOq` <- NULL
block_stylo$`https://t.co/PVB6QX7VpK` <- NULLThe attributive model has been run in a first iteration and has delivered an accuracy level of 92 %.
An analysis has been conducted about the false negatives and the false positives, that is to say the tweets that have been wrongly classified by the attributive model.
This analysis has led to a revision of the prospective predictors. A new version of predictor set has emerged with additional and alternative prospective predictors.
It is high time we ran the attributive model again, in a second iteration.
Here is the result in terms of accuracy level, our performance metric.
# Running algorithm xgbTree.
set.seed(1)
attribution_model <- train(block_stylo, target,
method = "xgbTree",
tuneLength = 5,
metric = "Accuracy")
# Predicting on training set.
pred <- predict(attribution_model)
# Accuracy
acc <- mean(pred == target)
# Table with accuracy
tab <- data.frame(acc) %>%
`colnames<-`("Tweet Attribution Model on Training Set - 2nd Run") %>%
`rownames<-`("Accuracy")
# Prints table.
knitr::kable(tab, align = "c",
table.attr = "class=\'bg-primary\'") %>%
kableExtra::kable_styling()| Tweet Attribution Model on Training Set - 2nd Run | |
|---|---|
| Accuracy | 0.9441701 |
Now, the attributive model reaches 94 % accuracy on the training set. This is deemed to be of valuable predictive quality in stylometry on very short texts.
There will be no further iteration on the training set.
The attributive model will be run on the validation set.
In this part, XGBoost Tree will be run on the validation set.
The validation set has first to be constructed. The row index of the validation set will be applied to the whole dataset to obtain the validation set.
data("trump_tweets")
tweets <- trump_tweets
rm(trump_tweets)
# The device names are kept only for both devices
# we are interested in. Moreover, they are simplified.
# The tweets are kept only if they were sent between the day
# Candidate Donald Trump announced his campaign and election day.
temporary_data_set <- tweets %>%
mutate(device = str_replace_all(
str_replace_all(source, "Twitter for Android", "Android"),
"Twitter for iPhone", "iPhone")) %>%
filter(device %in% c("Android", "iPhone") &
created_at >= ymd("2015-06-17") &
created_at < ymd("2016-11-08")) %>%
select(- source)
rm(tweets)
# Creating the validation set without conversion to UTF-8.
ind_val <- read_csv("ind_val.csv") %>%
as.data.frame() %>%
`colnames<-`("index") %>%
.$index
val_no_utf8 <- temporary_data_set[ind_val, ]
# Creating the validation set with conversion to UTF-8 and
# furthermore replacing curly double quotes by straight ones.
val_utf8 <- val_no_utf8 %>%
mutate(text = sapply(text, utf8_normalize, map_quote = TRUE)) %>%
mutate(text = str_replace_all(text, "“|”", '"')) %>%
select(id_str, text, device)
rm(temporary_data_set, ind_val)Some additional datasets must be added, just as it was done for the training set.
# URLS
# In case 2 URLs might stick to each other, let's use buffer
# with empty space character before URLs when extracting URLs.
buffer <- str_replace_all(val_utf8$text, "http", " http")
# VECTOR OF URLS
urls_val <- str_extract_all(buffer, "https://\\S+|http://\\S+")
rm(buffer)
# Converts a list of lists into a character vector.
urls_val <- unlist(urls_val)
# Drops punctuation at the end of extractions.
urls_val <- str_replace_all(urls_val, "[:punct:]+$", "")
# Keeps only unique URLs.
urls_val <- unique(urls_val)
# URLS AS REGEX PATTERN FOR TEXT MINING
# Escape characters in front of question marks in URLs.
urls_with_question_marks_val <-
str_replace_all(urls_val, "\\?", "\\\\?")
# URLs assembled into a Regex pattern for Text Mining
urls_as_pattern_val <-
paste(urls_with_question_marks_val, "|", sep = "", collapse = "")
urls_as_pattern_val <-
str_replace(urls_as_pattern_val, "\\|$", "")
# TWEETS WITH URL PLACEHOLDER
buffer <- val_utf8$text
for (i in 1:length(urls_val)) {
buffer <-
str_replace_all(buffer,
urls_with_question_marks_val[i],
"URLPLACEHOLDER")
}
val_utf8_no_urls <-
val_utf8 %>%
select(id_str, device) %>%
mutate(text = buffer) %>%
select(id_str, text, device)
rm(i, buffer)
# NUMBER OF URL OCCURRENCES PER TWEET
urls_count_val <- str_count(val_utf8$text, urls_as_pattern_val)
# It could also be done with
# str_count(val_utf8_no_urls$text, "URLPLACEHOLDER")
# VECTOR WITH ALL MENTIONS
buffer <- val_utf8_no_urls$text
mentions_val <- str_extract_all(buffer, "@\\w+")
mentions_val <- unique(unlist(mentions_val))
# MENTIONS AS REGEX PATTERN FOR TEXT MINING
mentions_as_pattern_val <-
paste(mentions_val, "|", sep = "", collapse = "")
mentions_as_pattern_val <-
str_replace(mentions_as_pattern_val, "\\|$", "")
# NUMBER OF MENTION OCCURRENCES PER TWEET
mentions_count_val <- str_count(buffer, mentions_as_pattern_val)
# VECTOR OF ALL HASHTAGS
hashtags_val <- str_extract_all(buffer, "#\\w+")
hashtags_val <- unique(unlist(hashtags_val))
# Discards strings #1 and #2 that are followed neither
# with letters, nor with digits, nor with underscore marks.
# Discards #1for .
# #'s is discarded by code above, which accepts
# only underscore as punctuation mark after #.
index1 <- str_detect(hashtags_val, "#1\\s|#1(?!_)[:punct:]|#1$")
index1 <- which(index1 ==TRUE)
index2 <- str_detect(hashtags_val, "#2\\s|#2(?!_)[:punct:]|#2$")
index2 <- which(index2 ==TRUE)
index3 <- str_detect(hashtags_val, "#1for")
index3 <- which(index3 ==TRUE)
index <- as.integer(c(index1, index2, index3))
index <- setdiff(1:length(hashtags_val), index)
hashtags_val <- hashtags_val[index]
rm(index, index1, index2, index3)
# HASHTAGS AS REGEX PATTERN FOR TEXT MINING
hashtags_as_pattern_val <-
paste(hashtags_val, "|", sep = "", collapse = "")
hashtags_as_pattern_val <-
str_replace(hashtags_as_pattern_val, "\\|$", "")
# NUMBER OF HASHTAG OCCURRENCES PER TWEET
hashtags_count_val <- str_count(buffer, hashtags_as_pattern_val)
# TWEETS WITH NEITHER URLS NOR MENTIONS NOR HASHTAGS
new <- str_replace_all(buffer,
mentions_as_pattern_val,
"MENTIONPLACEHOLDER")
new <- str_replace_all(new,
hashtags_as_pattern_val,
"HASHTAGPLACEHOLDER")
val_utf8_no_urls_mentions_hashtags <-
val_utf8_no_urls %>%
mutate(text = new)
rm(urls_with_question_marks_val, buffer, new)The validation predictors have to mimic the training set predictors. Consequently, the building process of the validation predictors is a duplicate of the building process of the training set predictors, of course except for some names.
So, it has seemed useless — and even tedious — to replicate all comments that have been made while building up the training set predictors. The presentation of how validation predictors are built up will simply be a very factual one, with some minimal presentation and code chunks.
Just as in the building process of the training set predictors, the first validation predictor is the emoji predictor, which will contain, for each tweet, the occurrence number of emojis in that tweet.
emoji_predictor <- ji_count(val_utf8$text)
# This predictor will join the punctuation block of validation
# predictors later on. It has to bear the same name as
# the counterpart in the block of training predictors. Among validation punctuation predictors, predictors based on single punctuation will be developed first. There are 17 of them. For each predictor, the occurrence number of the corresponding punctuation mark will be counted in each tweet.
single_punctuation_marks <-
c(".", "?", "!", ",", ":", "/", "+", "=", "$",
"_", "-", "–", "—", "(", ")", '"', "'")
column_names_single_punctuation <-
c("dot", "quest_mark", "ex_mark", "comma",
"colon", "slash", "plus", "equal", "dollar",
"underscore", "hyphen", "endash", "emdash",
"left_par", "right_par",
"doub_quot", "apostrophe_single_quote")
# Vector of multiple punctuation marks that will be first
# discarded before counting single punctuation marks.
to_be_discarded <-
c("\\.", "\\?", "!", ",", ":", "/", "\\+", "=", "\\$",
"_", "-", "–", "—", "\\(", "\\)", '\\"', "\\'")
to_be_discarded <-
paste(to_be_discarded, "{2,}", "|", sep = "", collapse = "")
# Are discarded as well two special sequences: numbers followed
# by a percent sign and & both will be treated
# with other special sequences below.
to_be_discarded <-
paste(to_be_discarded, "\\d+%|&", sep = "", collapse = "")
# Discards multiple punctuation marks from tweets
# just for counting single punctuation mark occurrences.
buffer <- val_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, to_be_discarded, "")
# Constructing a data frame as receptacle for single punctuation marks.
l <- length(single_punctuation_marks)
n <- nrow(val_utf8_no_urls_mentions_hashtags)
block_punctuation_val <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_single_punctuation)
# Counting occurrences of single punctuation marks.
for (i in 1:length(single_punctuation_marks)) {
block_punctuation_val[, i] <-
str_count(buffer, fixed(single_punctuation_marks[i]))
}
rm(single_punctuation_marks, column_names_single_punctuation,
to_be_discarded, buffer, l, n)
# Adding the emoji predictor.
block_punctuation_val$emojis_count <- emoji_predictor
rm(emoji_predictor)There are 4 validation predictors related to repetitive punctuation.
column_names_repetitive_punctuation <-
c("doub_hyphen", "trip_hyphen", "ellipsis", "quad_dot")
# Constructing a data frame as receptacle for repetitive
# punctuation mark predictors.
l <- length(column_names_repetitive_punctuation)
n <- nrow(val_utf8_no_urls_mentions_hashtags)
block_repetitive_punctuation_val <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_repetitive_punctuation)
# DOUBLE HYPHEN
# Discards triple (or more) hyphen to count double hyphens.
buffer <- val_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, "-{3,}", "")
# Counting occurrences of double hyphen.
block_repetitive_punctuation_val[, 1] <- str_count(buffer, "--")
# COUNTING TRIPLE HYPHEN AND ELLIPSIS
# Discards quadruple hyphens or dots to count triple hyphens or ellipses.
buffer <- val_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, "-{4,}|\\.{4,}", "")
# Counting occurrences of triple hyphen.
block_repetitive_punctuation_val[, 2] <- str_count(buffer, "---")
# Counting occurrences of ellipses.
block_repetitive_punctuation_val[, 3] <- str_count(buffer, "\\.{3}")
# COUNTING QUADRUPLE DOT
# Discards quintuple dot for counting quadruple dot.
buffer <- val_utf8_no_urls_mentions_hashtags$text
buffer <- str_replace_all(buffer, "\\.{5,}", "")
# Counting occurrences of quadruple dot.
block_repetitive_punctuation_val[, 4] <- str_count(buffer, "----")
# Joining with existing block_punctuation predictors.
block_punctuation_val <-
cbind(block_punctuation_val, block_repetitive_punctuation_val)
rm(column_names_repetitive_punctuation,
block_repetitive_punctuation_val, buffer, l, n)The validation predictor relating to single quotes will be built by excluding the mark ’ wherever it is supposed to be an apostrophe, just as in short forms and in most possessive forms. A special rule — called no odd number of single quotes — will be applied to hopefully eradicate the few apostrophes that might remain. Then the occurrence number of single quotes can be counted in each tweet.
The validation predictor referring to apostrophes will be built up for each tweet by deducting the occurrence count of single quotes from the occurrence number of the mark ’ in that tweet.
The building process is fully documented in the code chunk below.
# Single quotes will be counted first.
# Tweets will be lowercased since short forms need identifying
# and short forms are already lowercased in the package stopwords.
text <- val_utf8_no_urls_mentions_hashtags$text
text <- str_to_lower(text, locale = "en")
# Let's remove apostrophes from
# - the possessive forms 's,
# - plural forms such as $'s,
# - enclosed apostrophes in short forms or
# - in e.g. some family names like O'Reilly,
# - apostrophes in abbreviated two-digit format for years,
# - trailing apostrophes at the end of two colloquialisms,
# i.e. "ya'" and "lyin'", used by the Android device,
# - apostrophes in #'s and $'s .
# By the way, we do not eliminate apostrophes from
# the possessive forms s' because at the same time
# we would eliminate single quotation marks.
# We'll tackle that later on, in a completely different way.
text <- str_replace_all(text, "([^\\s])(\\')(s)", "\\1\\3")
text <- str_replace_all(text,
"([A-Za-z])(\\')([A-Za-z])", "\\1\\3")
text <- str_replace_all(text, "(\\')(\\d{2})", "\\2")
text <- str_replace_all(text, "(ya)(\\')", "\\1")
text <- str_replace_all(text, "(lyin)(\\')", "\\1")
text <- str_replace_all(text, "(#)(\\')(s)", "\\1\\3")
text <- str_replace_all(text, "(\\$)(\\')(s)", "\\1\\3")
# There can still remain apostrophes, probably in limited number.
# The number of single quotes should be an even number. If there
# remains 1 apostrophe among them in a tweet, then the number
# of presumed single quotes is an odd number: must be decreased by 1.
# Of course, this wouldn't solve the problem if there were 2 or 3
# apostrophes left in one tweet; this is not highly probable and
# has already been checked as not happening in the training set.
single_quote <- str_count(text, "\\'")
for (i in 1:nrow(val_utf8_no_urls_mentions_hashtags)) {
# If the number of ' is an odd number ...
if ((single_quote[i] %% 2) > 0) {
# ... then the number is reduced by 1.
single_quote[[i]] <- single_quote[i] - 1
}
}
# single_quot will join the predictors from block_punctuation_val
# and will be deducted from the predictor
# "apostrophe_single_quote" to give "apostrophe".
block_punctuation_val$single_quote <- single_quote
block_punctuation_val$apostrophe <-
block_punctuation_val$apostrophe_single_quote - single_quote
block_punctuation_val$apostrophe_single_quote <- NULL
rm(text, i, single_quote)A validation predictor will be added, containing the occurrence number of curly quotes and apostrophes in each tweet. Detailed documentation can be found in the sections EDA and Stylometric Predictors.
temp <-
val_no_utf8 %>%
mutate(text = str_replace_all(text,
urls_as_pattern_val,
"URLPLACEHOLDER")) %>%
.$text
block_punctuation_val$curliewurlies_count <-
str_count(temp, "’|‘|“|”")
rm(temp)There are 5 validation predictors linked to enclosed single punctuation marks. All documentation is included in the code chunk below, notably the look-around code.
# Selecting frequently enclosed punctuation marks.
single_enclosed_punctuation <-
c("\\.", "-", ":", ",", "/")
# Naming them.
column_names_single_enclosed_punctuation <-
c("dot_enc", "enc_hyphen", "enc_colon", "enc_comma",
"enc_slash")
# Constructing a receptacle data frame for occurrence counts.
l <- length(single_enclosed_punctuation)
n <- nrow(val_utf8_no_urls_mentions_hashtags)
block_single_enclosed_punctuation_val <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_single_enclosed_punctuation)
# Counting occurrences of top enclosed punctuation marks.
for (i in 1:length(single_enclosed_punctuation)) {
# The following pattern selects enclosed single punctuation
# marks but no enclosed repetitions of punctuation marks.
pattern <-
paste("(?![:punct:])\\S", single_enclosed_punctuation[i],
"(?![:punct:])\\S", sep = "")
block_single_enclosed_punctuation_val[, i] <-
str_count(val_utf8_no_urls_mentions_hashtags$text,
pattern)
}
# Adding predictors from enclosed punctuation to block of punctuation.
block_punctuation_val <-
cbind(block_punctuation_val, block_single_enclosed_punctuation_val)
# Deducting occurrence numbers of enclosed single punctuation
# from the total occurrence numbers of the same punctuation mark.
block_punctuation_val$comma <-
block_punctuation_val$comma - block_punctuation_val$enc_comma
block_punctuation_val$hyphen <-
block_punctuation_val$hyphen - block_punctuation_val$enc_hyphen
block_punctuation_val$colon <-
block_punctuation_val$colon - block_punctuation_val$enc_colon
block_punctuation_val$slash <-
block_punctuation_val$slash - block_punctuation_val$enc_slash
# Predictors related to dots will be adjusted later on
# with additional information.
rm(single_enclosed_punctuation,
column_names_single_enclosed_punctuation,
block_single_enclosed_punctuation_val, l, n, i, pattern)There are 25 validation predictors based on special sequences. In the code chunk below are included so well documentation as code — among others the look-around code.
Some reshuffling of existing validation predictors is also realized.
special_sequences <-
c("\\\n",
"\\&",
"\\S\\)\\S",
"[^[:punct:]]![^[:punct:]]",
"[a-zA-Z]-\\s",
"\\s-\\s",
"[a-zA-Z]-[a-zA-Z]",
"[a-z]\\.\\s",
"^\\.",
"!$",
"\\?$",
"\\.$",
"[:alnum:]$",
"[\\s\\$]\\d{1,3},\\d{3},\\d{3}[\\s[:punct:][a-zA-Z]][^\\d]",
"\\d+\\.\\d+",
"\\d+[^\\.%]",
"\\sA.M.|\\sP.M.",
"\\d{1,2}\\:\\d{2}(am|pm)",
"[^\\d]\\d{1,2}/\\d{1,2}/(\\d{2}){1,2}[\\s[a-zA-Z][:punct:]]",
"\\'\\d{2}[\\s[:punct:]]",
"[\\s[:punct:]]U.S.[\\s[:punct:]]",
"U.S.A.",
"Lyin\\'|lyin\\'|LYIN\\'",
"Havn\\'t|havn\\'t|HAVN\\'T",
"^#1[^\\w]|#1[^\\w]|#1$")
column_names_special_sequences <-
c("newline",
"ampersand",
"spec_enc_right_par",
"ex_mar_wo_punct",
"semi_enc_hyphen",
"free_hyphen",
"compounded_hyphen",
"period",
"dot_upfront",
"tweet_end_ex_mark",
"tweet_end_quest_mark",
"dot_tweet_end",
"tweet_end_alphanum",
"thousand_sep_2",
"dot_decimal_part",
"percentual_number",
"A.M._or_P.M.",
"time_num_colon_ampm",
"date_num_slashes",
"abb_millenial",
"U.S.",
"U.S.A.",
"lyin_apostrophe",
"havnt_apostrophe",
"number_one")
# Constructing a receptacle data frame for predictors.
l <- length(special_sequences)
n <- nrow(val_utf8_no_urls_mentions_hashtags)
block_special_sequences_val <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_special_sequences)
# Counting occurrences of each special sequence in each tweet.
for (i in 1:length(special_sequences)) {
block_special_sequences_val[, i] <-
str_count(val_utf8_no_urls_mentions_hashtags$text,
special_sequences[i])
}
# Adding special sequence predictors to block_punctuation_val.
block_punctuation_val <-
cbind(block_punctuation_val, block_special_sequences_val)
# Special sequence predictors will be subtracted
# from more general ones.
block_punctuation_val$quest_mark <-
block_punctuation_val$quest_mark -
block_punctuation_val$tweet_end_quest_mark
block_punctuation_val$apostrophe <-
block_punctuation_val$apostrophe -
block_punctuation_val$abb_millenial
# From the predictor right_par will, we will subtract the
# predictor spec_enc_right_par, which is specially enclosed
# on both sides in the sense that the surrounding characters can be
# any non-empty character including other punctuation marks, which
# has not been done for other enclosed punctuation marks.
block_punctuation_val$right_par <-
block_punctuation_val$right_par -
block_punctuation_val$spec_enc_right_par
# The predictor left_par, which contains the same information
# as right_par, will be discarded.
block_punctuation_val$left_par <- NULL
# Special sequences will be subtracted from enclosed ones.
block_punctuation_val$enc_comma <-
block_punctuation_val$enc_comma -
block_punctuation_val$thousand_sep_2
block_punctuation_val$enc_colon <-
block_punctuation_val$enc_colon -
block_punctuation_val$time_num_colon_ampm
block_punctuation_val$enc_slash <-
block_punctuation_val$enc_slash -
(2 * block_punctuation_val$date_num_slashes)
# Some predictors will be deleted because some special
# sequences provide more uneven occurrence breakdowns by device.
# First, the predictors dot and dot_enc are replaced
# by the predictors period, dot_upfront, dot_tweet_end,
# A.M._or_P.M., U.S., U.S.A., and dot_decimal_part.
block_punctuation_val$dot <- NULL
block_punctuation_val$dot_enc <- NULL
# Second, the predictors hyphen and enc_hyphen are
# replaced by the predictors semi_enc_hyphen, free_hyphen,
# and compounded_hyphen.
block_punctuation_val$hyphen <- NULL
block_punctuation_val$enc_hyphen <- NULL
# Third, the predictors ex_mark and enc_ex_mark are replaced
# by the predictors ex_mar_wo_punct and tweet_end_ex_mark.
block_punctuation_val$ex_mark <- NULL
block_punctuation_val$enc_ex_mark <- NULL
rm(special_sequences, column_names_special_sequences,
block_special_sequences_val, l, n, i)There is one validation predictor related to State abbreviations: for each tweet, the total of the occurrences of all State abbreviations in that tweet.
# List of abbreviations of American States without dots
state_abb_without_dots <- state.abb
# Replacing State abbreviations by placeholder.
buffer <-
val_utf8_no_urls_mentions_hashtags$text
for (i in 1:length(state_abb_without_dots)) {
# If State abbreviation is in front of tweet.
buffer <-
str_replace_all(buffer,
paste("(^", state_abb_without_dots[i],
")([\\s[:punct:]])", sep = ""),
"ABBPLACEHOLDER\\2")
# If State abbreviation is in the "middle" of tweet.
buffer <-
str_replace_all(buffer,
paste("([\\s[:punct:]])(", state_abb_without_dots[i],
")([\\s[:punct:]])", sep = ""),
"\\1ABBPLACEHOLDER\\3")
# If State abbreviation is at the end of tweet.
buffer <-
str_replace_all(buffer,
paste("([\\s[:punct:]])(", state_abb_without_dots[i],
"$)", sep = ""),
"\\1ABBPLACEHOLDER")
# Second iteration for a State abbreviation in the "middle" of tweet
buffer <-
str_replace_all(buffer,
paste("([\\s[:punct:]])(", state_abb_without_dots[i],
")([\\s[:punct:]])", sep = ""),
"\\1ABBPLACEHOLDER\\3")
}
# Counting State abbreviations without dots.
State_abb <- str_count(buffer, "ABBPLACEHOLDER")
# Adding count to block_punctuation.
block_punctuation_val$State_abb <- State_abb
rm(state_abb_without_dots, buffer, i, State_abb)There is a similar validation predictor based on all abbreviations that are uppercased and contain dots.
Moreover, all other dot cases will be fully reviewed and reshuffled, just as for the training set. Documentation and code are included in the code chunk below.
# Extracting all abbreviations uppercased and with dots.
# The Regex pattern looks for abbreviations at the beginning
# of tweets or elsewhere in tweets.
blub <-
str_extract_all(val_utf8_no_urls_mentions_hashtags$text,
"^([:upper:]\\.){2,}|[^[:alpha:]]([:upper:]\\.){2,}")
# Unlisting & collapsing each row in order to get a vector
# of abbreviations for each row.
for (i in 1:length(blub)) {
blub[[i]] <- paste(unlist(blub[i]), sep = "", collapse = " ")
}
# Eliminating a possible dot in front of an abbreviation.
blub <- str_replace_all(blub, "(\\s)(\\.)([:upper:])", "\\1,\\3")
# Counts the number of dots in abbreviations in each row.
block_punctuation_val$dot_abb <- str_count(blub, "\\.")
# Expressing the predictor block_punctuation$A.M._or_P.M.
# in occurrences of dots instead of occurrences of sequences
# in order to subtract it from block_punctuation$dot_abb.
# block_punctuation$A.M._or_P.M. is kept just as in the training set.
block_punctuation_val$A.M._or_P.M. <-
2 * block_punctuation_val$A.M._or_P.M.
block_punctuation_val$dot_abb <-
block_punctuation_val$dot_abb - block_punctuation_val$A.M._or_P.M.
rm(blub, i)
# In the rest of this code chunk, all other dot cases will be reviewed.
# In order to avoid double counting with other dot predictors,
# abbreviations will be neutralized by being replaced by placeholders.
# First, neutralizing abbreviations with dots in the "middle" or
# at the end of tweets. Abbreviations are replaced by a placeholder
# followed by a comma in order to keep a punctuation mark and preserve
# preexisting relationships with surrounding characters.
buffer <-
str_replace_all(val_utf8_no_urls_mentions_hashtags$text,
"([^[:alpha:]])(([:upper:]\\.){2,})",
"\\1ABBPLACEHOLDER,")
# Second, neutralizing abbreviations with dots in front of tweets.
buffer <- str_replace_all(buffer,
"^([:upper:]\\.){2,}",
"ABBPLACEHOLDER,")
# This buffer is a starting point to rationalize the whole set of
# predictors related to dots.
# Some previous predictors will be dropped because they
# have been implicitly regrouped in dot_abb .
block_punctuation_val$U.S. <- NULL
block_punctuation_val$U.S.A. <- NULL
# Registers repetitions of dots and discards them from buffer.
block_punctuation_val$dot_three_plus <-
str_count(buffer, "(\\.){3,}")
buffer <- str_replace_all(buffer, "(\\.){3,}", ",")
buffer <- str_replace_all(buffer, "(\\.){2}", ",")
# Removes block_punctuation$quad_dot which has been implicitly
# incorporated into block_punctuation$dot_three_plus.
block_punctuation_val$quad_dot <- NULL
# The rest from this code chunk mainly spots specific
# subgroups of enclosed dots with uneven occurrence breakdown by device.
# Dots enclosed between alphabetic characters.
block_punctuation_val$dot_enc_alpha <-
str_count(buffer, "[:alpha:]\\.[:alpha:]")
buffer <- str_replace_all(buffer,
"([:alpha:])(\\.)([:alpha:])",
"\\1,\\3")
block_punctuation_val$dot_enc <- NULL
# Dots enclosed between digits.
block_punctuation_val$dot_decimal_part <-
str_count(buffer, "\\d\\.\\d")
buffer <- str_replace_all(buffer, "(\\d)(\\.)(\\d)", "\\1,\\3")
# Dots enclosed between letter and double quotation marks.
block_punctuation_val$dot_enc_alpha_doub_quot <-
str_count(buffer, "[:alpha:]\\.\"")
buffer <- str_replace_all(buffer, "([:alpha:])(\\.)(\")", "\\1,\\3")
# Dots enclosed in other combinations are just removed.
buffer <- str_replace_all(buffer, "(\\S)(\\.)(\\S)", "\\1,\\3")
# Dots left enclosed between hashtags and empty space.
block_punctuation_val$dot_left_enc_hashtag <-
str_count(buffer, "HASHTAGPLACEHOLDER\\.")
buffer <- str_replace_all(buffer,
"HASHTAGPLACEHOLDER\\.",
"HASHTAGPLACEHOLDER,")
# Dots left enclosed between mentions and empty space.
block_punctuation_val$dot_left_enc_mention <-
str_count(buffer, "MENTIONPLACEHOLDER\\.")
buffer <- str_replace_all(buffer,
"MENTIONPLACEHOLDER\\.",
"MENTIONPLACEHOLDER,")
# Dots left enclosed between URLs and empty space are just removed.
buffer <- str_replace_all(buffer,
"URLPLACEHOLDER\\.",
"URLPLACEHOLDER,")
# Recalculates number of dots at the end of tweets.
block_punctuation_val$dot_tweet_end <- str_count(buffer, "\\.$")
buffer <- str_replace_all(buffer, "\\.$", ",")
# Recalculates number of dots at the beginning of tweets.
block_punctuation_val$dot_upfront <- str_count(buffer, "^\\.")
buffer <- str_replace_all(buffer, "^\\.", ",")
# Dots left enclosed between upper letter and empty space.
block_punctuation_val$dot_left_enc_upper <-
str_count(buffer, "[:upper:]\\.\\s")
# Dots left enclosed between lower letter or digit and empty space.
block_punctuation_val$dot_left_enc_lower_digit <-
str_count(buffer, "[[:lower:]\\d]\\.\\s")
# Dots left enclosed between punctuation and empty space.
block_punctuation_val$dot_left_enc_punct <-
str_count(buffer, "[:punct:]\\.\\s")
# The remaining dots are not incorporated in any predictor.
# Previous predictor is out because of redundancy.
block_punctuation_val$dot <- NULLValidation predictors are added for each short form that was used at least 10 times in the training set. Indeed, the predictors must be the same for the training set and for the validation set when predicting on the validation set with the ML attributive model.
# Short forms will be added as predictors if and only if
# they were training set predictors.
# In order to facilitate text mining, let's lowercase
# the tweets since short forms are already lowercased.
buffer <- val_utf8_no_urls_mentions_hashtags$text
buffer <- str_to_lower(buffer, locale = "en")
# Only some short forms have been used on the training set.
# The list of them has been kept under the name top_short_forms.
# The same short forms have to be used on the validation set.
column_names_top_short_forms <-
str_replace_all(top_short_forms, "\\'", "_")
# Data frame as receptacle for top short form predictors
l <- length(top_short_forms)
n <- nrow(val_utf8_no_urls_mentions_hashtags)
block_top_short_forms_val <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_top_short_forms)
# Counting occurrences of top short forms.
for (i in 1:l) {
block_top_short_forms_val[, i] <-
str_count(val_utf8_no_urls_mentions_hashtags$text,
top_short_forms[i])
}
rm(buffer, column_names_top_short_forms, l, n, i)The same holds for stopwords other than short forms.
# Other stopwords will be added as predictors if and only if
# they were training set predictors.
# In order to facilitate text mining, let's lowercase
# the tweets since short forms are already lowercased.
buffer <- val_utf8_no_urls_mentions_hashtags$text
buffer <- str_to_lower(buffer, locale = "en")
# Among other stopwords, only some have been used on the training set.
# The list of them has been kept under the name top_other_stopwords.
# The same short forms have to be used on the validation set.
column_names_top_other_stopwords <-
str_replace_all(top_other_stopwords, "\\'", "_")
# Data frame as receptacle for top other stopword predictors
l <- length(top_other_stopwords)
n <- nrow(val_utf8_no_urls_mentions_hashtags)
block_top_other_stopwords_val <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(column_names_top_other_stopwords)
# Counting occurrences of top other stopwords.
for (i in 1:l) {
block_top_other_stopwords_val[, i] <-
str_count(buffer,
paste("^", top_other_stopwords[i], "[^[:alnum:]-]",
"|[^[:alnum:]]", top_other_stopwords[i], "[^[:alnum:]-]",
"|[^[:alnum:]]", top_other_stopwords[i], "\\$",
sep = ""))
}
rm(column_names_top_other_stopwords, l, n, i)
# Building up one single block for all stopwords.
block_stopwords_val <-
cbind(block_top_short_forms_val, block_top_other_stopwords_val)
rm(block_top_short_forms_val, block_top_other_stopwords_val)There are 5 validation predictors based on URLs:
# Keeping just tweets.
buffer <- val_utf8$text
# First predictor: total number of URLs in each tweet
# already exists in urls_count_val.
# Only some short forms have been used on the training set.
# The list of them has been kept under the name top_urls.
# The same short forms have to be used on the validation set.
l <- length(top_urls)
n <- length(buffer)
block_urls_val <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1) %>%
`colnames<-`(top_urls)
for (i in 1:l) {
block_urls_val[, i] <- str_count(buffer, top_urls[i])
}
# Adding the general total urls_count_val. It must have
# the same name as the equivalent in block_stylo (training set)
# for Machine Learning.
urls_count <- urls_count_val
block_urls_val <- cbind(block_urls_val, urls_count)
rm(buffer, i, l, n, urls_count, urls_count_val)There are two types of mention validation predictors:
# Keeping just tweets.
buffer <- val_utf8_no_urls$text
# Only some mentions have been used on the training set.
# The list of them has been kept under the name top_mentions.
# The same short forms have to be used on the validation set.
l <- length(top_mentions)
n <- length(buffer)
block_mentions_val <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1)
for (i in 1:l) {
block_mentions_val[, i] <- str_count(buffer, top_mentions[i])
}
# Making column names R friendly.
names(block_mentions_val) <-
make.names(top_mentions, unique = TRUE)
# Adding the general total mentions_count_val. It must have
# the same name as the equivalent in block_stylo (training set)
# for Machine Learning.
mentions_count <- mentions_count_val
block_mentions_val <- cbind(block_mentions_val, mentions_count)
rm(buffer, l, n, i, mentions_count, mentions_count_val)There are two types of hashtag validation predictors:
# Keeping just tweets.
buffer <- val_utf8_no_urls$text
# First predictor: total number of tweets per tweet
# already exists in hashtags_count_val.
# Only some hashtags have been used on the training set.
# The list of them has been kept under the name top_hashtags.
# The same hashtags have to be used on the validation set.
l <- length(top_hashtags)
n <- length(buffer)
block_hashtags_val <-
data.frame(matrix(n * l, nrow = n, ncol = l) * 1)
for (i in 1:l) {
block_hashtags_val[, i] <-
str_count(buffer, top_hashtags[i])
}
# Making column names R friendly.
names(block_hashtags_val) <-
make.names(top_hashtags, unique = TRUE)
# Adding the general total hashtags_count_val. It must have
# the same name as the equivalent in block_stylo (training set)
# for machine learning.
hashtags_count <- hashtags_count_val
block_hashtags_val <- cbind(block_hashtags_val, hashtags_count)
rm(buffer, l, n, i, hashtags_count, hashtags_count_val)
# Assembling predictors from URLs, mentions, and hashtags.
block_entities_val <-
cbind(block_urls_val, block_mentions_val, block_hashtags_val)
rm(block_urls_val, block_mentions_val, block_hashtags_val)All validation predictors that have been described up to this point have been copied from the training set predictors used when running the attributive model on the training set in the first iteration.
block_stylo_val <-
cbind(block_punctuation_val, block_stopwords_val, block_entities_val)
rm(block_punctuation_val, block_stopwords_val, block_entities_val)For the second predictive iteration on the training set, there were some additional or alternative predictors. These are mimicked here for the validation set.
# Preparing a version of tweets not only without URLs, mentions,
# and hashtags, but also without their placeholders.
text <-
str_replace_all(val_utf8_no_urls_mentions_hashtags$text,
"URLPLACEHOLDER|MENTIONPLACEHOLDER|HASHTAGPLACEHOLDER", "")
# Here are the additional prospective predictors.
w_slash <- str_count(text, "w/|\\sw\\s")
r_apostrophe_s <- str_count(text, "R's")
re_colon <- str_count(text, "re:")
digit_slash_digit_within_par <- str_count(text, "\\(\\d/\\d\\)")
digits_ampm <- str_count(text, "[^\\d:]\\dam|[^\\d:]\\dpm")
words_lowercased <- str_count(text, "[:lower:]{2,}")
MAGA <- str_count(text, "MAKE AMERICA GREAT AGAIN")
GOP <- str_count(text, "GOP")
CNN <- str_count(text, "CNN")
yrs <- str_count(text, "yrs")
# Assembling them into a new block.
block_addendum_val <-
cbind(words_lowercased, MAGA, GOP, CNN, w_slash,
r_apostrophe_s, yrs, digit_slash_digit_within_par,
re_colon, digits_ampm)
rm(words_lowercased, MAGA, GOP, CNN, w_slash,
r_apostrophe_s, yrs, digit_slash_digit_within_par,
re_colon, digits_ampm)
# Adding the new block to the existing one.
block_stylo_val <- cbind(block_stylo_val, block_addendum_val)
rm(block_addendum_val)
# First regrouping
block_stylo_val$dash <- block_stylo_val$endash + block_stylo_val$emdash
block_stylo_val$endash <- NULL
block_stylo_val$emdash <- NULL
# Second regrouping
block_stylo_val$ampm <- block_stylo_val$time_num_colon_ampm +
block_stylo_val$digits_ampm
block_stylo_val$time_num_colon_ampm <- NULL
block_stylo_val$digits_ampm <- NULL
# Third regrouping
block_stylo_val$top_urls <- block_stylo_val$`https://t.co/3KWOl20zMm` +
block_stylo_val$`https://t.co/3KWOl2ibaW` +
block_stylo_val$`https://t.co/ANvTcZqfOq` +
block_stylo_val$`https://t.co/PVB6QX7VpK`
block_stylo_val$`https://t.co/3KWOl20zMm` <- NULL
block_stylo_val$`https://t.co/3KWOl2ibaW` <- NULL
block_stylo_val$`https://t.co/ANvTcZqfOq` <- NULL
block_stylo_val$`https://t.co/PVB6QX7VpK` <- NULL
# Validation dataset will be removed The target variable
# will not be used in predicting but later in asserting
# prediction performance with 1 representing
# the iPhone (the main class) and 2 the Android device.
target_val <- val_no_utf8$device
target_val <- str_replace_all(target_val, "Android", "2")
target_val <- str_replace_all(target_val, "iPhone", "1")
target_val <- as.integer(target_val)Here is the accuracy level obtained on the validation set when running the attributive model.
# Predictions on the validation set with XGBoost Tree
pred_xgbTree_5 <-
predict(attribution_model, newdata = block_stylo_val)
# Accuracy on the validation set with XGBoost Tree
acc_xgbTree_5 <-
round(mean(pred_xgbTree_5 == target_val), 4)
# Table with accuracy
tab <- data.frame(acc_xgbTree_5) %>%
`colnames<-`("Tweet Attribution Model on the Validation Set") %>%
`rownames<-`("Accuracy")
# Prints table with bg-primary layout.
knitr::kable(tab, align = "c",
table.attr = "class=\'bg-primary\'") %>%
kableExtra::kable_styling()| Tweet Attribution Model on the Validation Set | |
|---|---|
| Accuracy | 0.918 |
This accuracy level seems excellent in a stylometric approach applied to very short texts.
It is hardly inferior to the level obtained on the training set in the second try. This could be interpreted as an indication that there was no, or almost no, overfitting on the training set.
In this project, tweet attribution has been conducted only through stylometry, with focus on emojis, single punctuation, multiple punctuation, enclosed punctuation, special sequences, abbreviations, function words — stopwords —, and entities — URLs, mentions, and hashtags. This stylometric project is a challenge because texts are very short.
In spite of that, the attributive model has delivered an accuracy level of 92 % on the training set in a first run thanks to long preparation through Data Profiling, Data Wrangling, Exploratory Data Analysis, and Predictor Building.
A thorough analysis of false negatives and false positives has been conducted. This has delivered insights as well as some additional and alternative predictors.
With this reshuffle, the accuracy level on the training set has been boosted to 94 % in a second iteration. After this improvement, no new iteration has been run on the training set.
On the validation set, the attributive model has reached 92 %. This is deemed of valuable predictive quality on very short texts.
This rather long drill down process has also showed that predictive power is sometimes in details, in modest tweet components, which are often discarded in case of tokenization. Among the 212 predictors, let’s remember just a few examples:
Other predictors would have been readily available, outside of a stylometric approach, such as timing, interaction — likes and retweets —, content words from tokenization, content words from expanding hashtags, and sentiments expressed by content words. These could be used on the same dataset in complementary approaches…
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=French_France.1252 LC_CTYPE=French_France.1252
## [3] LC_MONETARY=French_France.1252 LC_NUMERIC=C
## [5] LC_TIME=French_France.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] wordcloud2_0.2.3 emo_0.0.0.9000 githubinstall_0.2.2
## [4] states_0.3.1 xgboost_1.4.1.1 caret_6.0-88
## [7] lattice_0.20-41 plotly_4.9.4.1 DT_0.18
## [10] fastmap_1.1.0 sourcetools_0.1.7 xtable_1.8-4
## [13] httpuv_1.6.2 shiny_1.6.0 htmltools_0.5.2
## [16] devtools_2.4.2 usethis_2.0.1 stopwords_2.2
## [19] textdata_0.4.1 tidytext_0.3.1 quanteda_3.1.0
## [22] textreg_0.1.5 tm_0.7-8 NLP_0.2-1
## [25] utf8_1.2.2 gridExtra_2.3 kableExtra_1.3.4
## [28] knitr_1.33 ggthemes_4.2.4 lubridate_1.7.10
## [31] scales_1.1.1 forcats_0.5.1 stringr_1.4.0
## [34] dplyr_1.0.7 purrr_0.3.4 readr_2.0.1
## [37] tidyr_1.1.3 tibble_3.1.4 ggplot2_3.3.5
## [40] tidyverse_1.3.1 dslabs_0.7.4
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.2.1 fastmatch_1.1-3
## [4] systemfonts_1.0.2 plyr_1.8.6 lazyeval_0.2.2
## [7] splines_4.0.3 crosstalk_1.1.1 SnowballC_0.7.0
## [10] digest_0.6.27 foreach_1.5.1 fansi_0.5.0
## [13] magrittr_2.0.1 memoise_2.0.0 tzdb_0.1.2
## [16] remotes_2.4.0 recipes_0.1.16 modelr_0.1.8
## [19] gower_0.2.2 RcppParallel_5.1.4 vroom_1.5.4
## [22] svglite_2.0.0 prettyunits_1.1.1 colorspace_2.0-2
## [25] rvest_1.0.1 haven_2.4.3 xfun_0.25
## [28] callr_3.7.0 crayon_1.4.1 jsonlite_1.7.2
## [31] survival_3.2-7 iterators_1.0.13 glue_1.4.2
## [34] gtable_0.3.0 ipred_0.9-11 webshot_0.5.2
## [37] pkgbuild_1.2.0 DBI_1.1.1 Rcpp_1.0.7
## [40] viridisLite_0.4.0 proxy_0.4-26 bit_4.0.4
## [43] stats4_4.0.3 lava_1.6.9 prodlim_2019.11.13
## [46] htmlwidgets_1.5.3 httr_1.4.2 ellipsis_0.3.2
## [49] pkgconfig_2.0.3 nnet_7.3-14 sass_0.4.0
## [52] dbplyr_2.1.1 labeling_0.4.2 tidyselect_1.1.1
## [55] rlang_0.4.11 reshape2_1.4.4 later_1.3.0
## [58] munsell_0.5.0 cellranger_1.1.0 tools_4.0.3
## [61] cachem_1.0.6 cli_3.0.1 generics_0.1.0
## [64] broom_0.7.9 evaluate_0.14 yaml_2.2.1
## [67] bit64_4.0.5 ModelMetrics_1.2.2.2 processx_3.5.2
## [70] fs_1.5.0 nlme_3.1-149 mime_0.11
## [73] slam_0.1-48 xml2_1.3.2 tokenizers_0.2.1
## [76] compiler_4.0.3 rstudioapi_0.13 curl_4.3.2
## [79] e1071_1.7-8 testthat_3.0.4 reprex_2.0.1
## [82] bslib_0.2.5.1 stringi_1.7.4 highr_0.9
## [85] ps_1.6.0 desc_1.3.0 Matrix_1.2-18
## [88] vctrs_0.3.8 pillar_1.6.2 lifecycle_1.0.0
## [91] jquerylib_0.1.4 data.table_1.14.0 R6_2.5.1
## [94] promises_1.2.0.1 janeaustenr_0.1.5 sessioninfo_1.1.1
## [97] codetools_0.2-16 MASS_7.3-53 assertthat_0.2.1
## [100] pkgload_1.2.1 rprojroot_2.0.2 withr_2.4.2
## [103] parallel_4.0.3 hms_1.1.0 grid_4.0.3
## [106] rpart_4.1-15 timeDate_3043.102 class_7.3-17
## [109] rmarkdown_2.10 pROC_1.17.0.1
Here are a few diverse references I have used or consulted.
https://journal.r-project.org/archive/2016/RJ-2016-007/index.html Eder, M., Rybicki, J. and Kestemont, M. (2016). Stylometry with R: a package for computational text analysis. R Journal 8(1): 107-121.
http://pablobarbera.com/social-media-upf/code/01-text-intro.html
https://www.tidytextmining.com/
https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html
https://sicss.io/2018/materials/day3-text-analysis/basic-text-analysis/rmarkdown/
https://bookdown.org/rdpeng/rprogdatascience/regular-expressions.html
https://stackoverflow.com/questions/45828985/error-in-stri-detect-regex-in-r
https://www.petefreitag.com/cheatsheets/regex/character-classes/
https://cran.r-project.org/web/views/NaturalLanguageProcessing.html
https://stackoverflow.com/questions/13762868/how-do-i-extract-hashtags-from-tweets-in-r
https://cran.r-project.org/web/packages/textclean/textclean.pdf
https://stackoverflow.com/questions/29890303/case-insensitive-sort-of-vector-of-string-in-r
https://www.rdocumentation.org/packages/stopwords/versions/2.2
https://www.smashingmagazine.com/2016/11/character-sets-encoding-emoji/
https://rdrr.io/github/hadley/emo/man/ji_extract.html
https://www.thepunctuationguide.com/
https://jakubmarian.com/dot-period-full-stop-and-point-in-english/
https://www.grammarly.com/blog/hyphens-and-dashes/
https://blog.esllibrary.com/2014/09/24/punctuation-rules-parentheses/
https://practicaltypography.com/straight-and-curly-quotes.html
https://rdrr.io/cran/utf8/man/utf8_normalize.html
https://editorsmanual.com/articles/ellipsis/
https://en.wikipedia.org/wiki/Ellipsis
https://www.fileformat.info/info/unicode/char/2026/index.htm
https://cran.r-project.org/web/packages/xgboost/vignettes/discoverYourData.html#feature-importance
https://www.pnas.org/content/112/45/13892
https://www.tweetbinder.com/blog/trump-twitter/
https://www.kaggle.com/erikbruin/text-mining-the-clinton-and-trump-election-tweets
https://twitter.com/tvaziri/status/762005541388378112
http://varianceexplained.org/r/trump-tweets/
https://www.techwalla.com/articles/what-characters-are-allowed-in-a-twitter-name
https://advicemedia.com/blog/social-media/hashtags-101/
https://www.hashtags.org/featured/what-characters-can-a-hashtag-include/
http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/
https://venngage.com/blog/color-blind-friendly-palette/#2
http://www.sthda.com/english/wiki/ggplot2-themes-and-background-colors-the-3-elements
https://ggplot2.tidyverse.org/reference/scale_manual.html
https://holtzy.github.io/Pimp-my-rmd/
https://rstudio.github.io/DT/options
https://rstudio.github.io/DT/010-style.html
https://rstudio.github.io/DT/functions.html
https://stackoverflow.com/questions/46853567/centering-plotly-output-to-html
https://stackoverflow.com/questions/25646333/code-chunk-font-size-in-rmarkdown-with-knitr-and-latex
https://bookdown.org/yihui/rmarkdown-cookbook/chunk-styling.html
https://datatables.net/forums/discussion/60924/how-to-remove-a-border
https://stackoverflow.com/questions/35749389/column-alignment-in-dt-datatable
https://plotly.com/r/hover-text-and-formatting/
https://stackoverflow.com/questions/49494121/plotly-change-hover-popup-styling
https://plotly.com/r/reference/#box-hoverlabel-bordercolorsrc
https://plotly.com/r/text-and-annotations/
https://www.littlemissdata.com/blog/wordclouds
https://cran.r-project.org/web/packages/wordcloud2/vignettes/wordcloud.html
https://www.r-graph-gallery.com/196-the-wordcloud2-library.html
https://www.learningrfordatascience.com/post/dynamic-wordclouds-with-wordcloud2/
https://stackoverflow.com/questions/51418946/how-to-align-column-title-and-content-in-knitr